A leading credit card provider CredX receives thousands of applicants every year and, experiencing increased credit loss over the last few years. The CEO wants to mitigate the credit risk.
1. Business Objective(s)
1.1 Business Understanding
CredX intends to mitigate their credit risk during acquisition by 'Finding The Right Customers'.
1.2 Goals of Data Analysis
1.2.1. Using past data of the bank's applicants identify the most important factors affecting credit risk
1.2.2. Create strategies to mitigate the acquisition risk for new applications, by identifying right
customers using predictive modelling to differentiate Good Vs Bad customer
2. Data Understanding
3.1 Demographic Data
Contains customer-level information like Age, Gender, Marital Status and Salary etc.
3.2 Credit Bureau Data
This is taken from the credit bureau, contains past Avg Credit Card Utilization, Outstanding balance
and 30/60/90 DPDs in last 6/12 months etc.
3. Data Preparation
3.1 Data Cleaning
3.2 Data Imputation
3.3 Feature Engineering
3.3.1 Derived Variables
3.3.1 Encoding / Dummy variables
3.3.1 Weight-of-Evidence (WoE)/Information Value (IV) computation
4. Exploratory Data Analysis
4.1 Univariate Analysis
4.2 Bi-variate
4.3 Multi-variate Analysis
5. Feature selection
We will use Use Weight-of-Evidence(WoE) /Information Value(IV) for feature selection
4. Modeling Building
This is a binary classification problem with highly unbalanced data. We will apply following combination
of model types, sampling techniques & cross-validation
4.1 Demographic Data - Unbalanced Data - Logistic Regression
4.2 Demographic & Credit Bureau Data - Unbalanced Data - Logistic Regression
4.3 Demographic & Credit Bureau Data - Under Sampling - Logistic Regression - with Cross Validation
4.4 Demographic & Credit Bureau Data - Over Sampling - Logistic Regression - with Cross Validation
4.5 Demographic & Credit Bureau Data - SMOTE Sampling - Logistic Regression - with Cross Validation
4.6 Demographic & Credit Bureau Data - SMOTE Sampling - Decision Trees - with Cross Validation
4.7 Demographic & Credit Bureau Data - SMOTE Sampling - Random Forest - with Cross Validation
5. Models Evaluation using Metrics & Final Model Selection
5.1 Accuracy, Sensitivity & Specificity
5.2 F-Score (F1)
5.3 Area Under Curve (AUC)
5.4 KSStatistic
5.5 ROC Curve
5.6 Vintage Curve
6. Model Deployment
7. Application scorecard Building
8. Calculate scores on Rejected Population Data
9. Financial Benefit Analysis
# Loading Libraries
library(ggplot2)
library(dplyr)
library(outliers)
library(corrplot)
library(MASS)
library(caret)
library(ROSE)
library(car)
library(reshape2)
library(scales)
library(tidyr)
library(ROCR)
library(tibble)
# Common Functions
# For calculating Mode value of Categorical variables
ModeFunc <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Function to determine the outliers in a measure
checkForOutliersDetection <- function(dt, var) {
var_name <- eval(substitute(var),eval(dt))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
na2 <- sum(is.na(var_name))
cat("Outliers identified:", na2 - na1, "n")
}
# For plotting correlation matrix
plot_correlationMatrix <- function (data, features) {
melted_cor_matrix <- melt(round(cor(data [ names(data)
%in%
features],
use="complete.obs"),2))
ggplot(data = melted_cor_matrix, aes(x=Var1, y=Var2, fill=value, label=value)) +
geom_tile() +
geom_text() +
xlab('') +
ylab('') +
theme_minimal() +
theme(axis.text.x = element_text(size=10,
hjust=-0.08,
angle= -35 ))
}
# Loading Demographic Data
demographic_data.original <-read.csv("Demographic data.csv",
header = TRUE,
stringsAsFactors = FALSE)
# Loading Credit Bureau Data
creditbureau_data.original <- read.csv("Credit Bureau data.csv",
header = TRUE,
stringsAsFactors = FALSE)
# Data Cleaning
# 1. Remove Duplicate Records
# 2. Separate Rejected Applications from Data Analysis & Model Building
# 3. Remove Invalid / Incorrect Values Records
# 4. Missing Values Treatment
# 5. Outlier Treatment
# checking for Total row count
# 71295
nrow(demographic_data.original)
## [1] 71295
# 71295
nrow(creditbureau_data.original)
## [1] 71295
# Checking for Duplicate records
# 71292 - 3 Duplicate records exist
length(unique(demographic_data.original$Application.ID))
## [1] 71292
# 71292 - 3 Duplicate records exist
length(unique(creditbureau_data.original$Application.ID))
## [1] 71292
# Remove duplicate records
demographic_data <- demographic_data.original[!duplicated(demographic_data.original$Application.ID),]
creditbureau_data <- creditbureau_data.original[!duplicated(creditbureau_data.original$Application.ID),]
summary(demographic_data)
## Application.ID Age Gender
## Min. :1.004e+05 Min. :-3.00 Length:71292
## 1st Qu.:2.484e+08 1st Qu.:37.00 Class :character
## Median :4.976e+08 Median :45.00 Mode :character
## Mean :4.990e+08 Mean :44.94
## 3rd Qu.:7.496e+08 3rd Qu.:53.00
## Max. :1.000e+09 Max. :65.00
##
## Marital.Status..at.the.time.of.application. No.of.dependents
## Length:71292 Min. :1.000
## Class :character 1st Qu.:2.000
## Mode :character Median :3.000
## Mean :2.865
## 3rd Qu.:4.000
## Max. :5.000
## NA's :3
## Income Education Profession Type.of.residence
## Min. :-0.5 Length:71292 Length:71292 Length:71292
## 1st Qu.:14.0 Class :character Class :character Class :character
## Median :27.0 Mode :character Mode :character Mode :character
## Mean :27.2
## 3rd Qu.:40.0
## Max. :60.0
##
## No.of.months.in.current.residence No.of.months.in.current.company
## Min. : 6.00 Min. : 3.00
## 1st Qu.: 6.00 1st Qu.: 16.00
## Median : 11.00 Median : 34.00
## Mean : 34.56 Mean : 33.96
## 3rd Qu.: 60.00 3rd Qu.: 51.00
## Max. :126.00 Max. :133.00
##
## Performance.Tag
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0422
## 3rd Qu.:0.0000
## Max. :1.0000
## NA's :1425
summary(creditbureau_data)
## Application.ID No.of.times.90.DPD.or.worse.in.last.6.months
## Min. :1.004e+05 Min. :0.0000
## 1st Qu.:2.484e+08 1st Qu.:0.0000
## Median :4.976e+08 Median :0.0000
## Mean :4.990e+08 Mean :0.2703
## 3rd Qu.:7.496e+08 3rd Qu.:0.0000
## Max. :1.000e+09 Max. :3.0000
##
## No.of.times.60.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4305
## 3rd Qu.:1.0000
## Max. :5.0000
##
## No.of.times.30.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.5772
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.90.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4503
## 3rd Qu.:1.0000
## Max. :5.0000
##
## No.of.times.60.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.6555
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.30.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.8009
## 3rd Qu.:1.0000
## Max. :9.0000
##
## Avgas.CC.Utilization.in.last.12.months
## Min. : 0.0
## 1st Qu.: 8.0
## Median : 15.0
## Mean : 29.7
## 3rd Qu.: 46.0
## Max. :113.0
## NA's :1058
## No.of.trades.opened.in.last.6.months
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 2.298
## 3rd Qu.: 3.000
## Max. :12.000
## NA's :1
## No.of.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 5.000
## Mean : 5.827
## 3rd Qu.: 9.000
## Max. :28.000
##
## No.of.PL.trades.opened.in.last.6.months
## Min. :0.000
## 1st Qu.:0.000
## Median :1.000
## Mean :1.207
## 3rd Qu.:2.000
## Max. :6.000
##
## No.of.PL.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 2.000
## Mean : 2.397
## 3rd Qu.: 4.000
## Max. :12.000
##
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 1.764
## 3rd Qu.: 3.000
## Max. :10.000
##
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 3.000
## Mean : 3.535
## 3rd Qu.: 5.000
## Max. :20.000
##
## Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## Min. :0.0000 Min. : 0 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 211537 1st Qu.: 3.000
## Median :0.0000 Median : 774994 Median : 6.000
## Mean :0.2564 Mean :1249195 Mean : 8.187
## 3rd Qu.:1.0000 3rd Qu.:2920797 3rd Qu.:10.000
## Max. :1.0000 Max. :5218801 Max. :44.000
## NA's :272 NA's :272
## Presence.of.open.auto.loan Performance.Tag
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.08462 Mean :0.0422
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## NA's :1425
# Checking NA Values
sapply(demographic_data, function(x) sum(is.na(x) | is.null(x)))
## Application.ID
## 0
## Age
## 0
## Gender
## 0
## Marital.Status..at.the.time.of.application.
## 0
## No.of.dependents
## 3
## Income
## 0
## Education
## 0
## Profession
## 0
## Type.of.residence
## 0
## No.of.months.in.current.residence
## 0
## No.of.months.in.current.company
## 0
## Performance.Tag
## 1425
sapply(creditbureau_data, function(x) sum(is.na(x) | is.null(x)))
## Application.ID
## 0
## No.of.times.90.DPD.or.worse.in.last.6.months
## 0
## No.of.times.60.DPD.or.worse.in.last.6.months
## 0
## No.of.times.30.DPD.or.worse.in.last.6.months
## 0
## No.of.times.90.DPD.or.worse.in.last.12.months
## 0
## No.of.times.60.DPD.or.worse.in.last.12.months
## 0
## No.of.times.30.DPD.or.worse.in.last.12.months
## 0
## Avgas.CC.Utilization.in.last.12.months
## 1058
## No.of.trades.opened.in.last.6.months
## 1
## No.of.trades.opened.in.last.12.months
## 0
## No.of.PL.trades.opened.in.last.6.months
## 0
## No.of.PL.trades.opened.in.last.12.months
## 0
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 0
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 0
## Presence.of.open.home.loan
## 272
## Outstanding.Balance
## 272
## Total.No.of.Trades
## 0
## Presence.of.open.auto.loan
## 0
## Performance.Tag
## 1425
# Making the Performance Tag as a factor variable as new feature
demographic_data$Performance <- as.factor(demographic_data$Performance.Tag)
# Validate Performance Tag across Application IDs in both data-sets
# [1] Application.ID Performance.Tag
# <0 rows> (or 0-length row.names)
setdiff(dplyr::select(demographic_data, Application.ID, Performance.Tag),dplyr::select(creditbureau_data, Application.ID, Performance.Tag))
## [1] Application.ID Performance.Tag
## <0 rows> (or 0-length row.names)
customer_master_data <- merge(demographic_data, creditbureau_data, by="Application.ID")
View(customer_master_data)
# Remove Performance.Tag.x and Performance.Tag.y
customer_master_data <- customer_master_data[,-12]
customer_master_data <- customer_master_data[,-30]
# Check distribution of Classes and also NA values
# 0 1 NA's
#66920 2947 1425
summary(customer_master_data$Performance)
## 0 1 NA's
## 66920 2947 1425
# Also classes (1 and 0) distribution, is highly unbalanced
# [1] 1.9992
((1425/71276)*100)
## [1] 1.99927
# The records with Performance = NA are treated as Applications rejected.
# There are approximately 2% and we are ignoring them in the modeling.
# Separating 1425 records Performance = NA
rejected_records <- customer_master_data[which(is.na(customer_master_data$Performance)),]
# [1] 1425
nrow(rejected_records)
## [1] 1425
# Retaining records that have Performance = 1 or 0
customer_master_data <- customer_master_data[-which(is.na(customer_master_data$Performance)),]
# Data Quality Checks for Rejected Population
# [1] 69867
length(customer_master_data$Performance)
## [1] 69867
# [1] 1425
length(rejected_records$Application.ID)
## [1] 1425
# [1] 1425
# No Duplicate records in Rejected population
length(unique(rejected_records$Application.ID))
## [1] 1425
# No Records with Age <18
sort(unique(rejected_records$Age), decreasing = FALSE)
## [1] 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## [24] 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
# [1] 0
sum(is.na(rejected_records$No.of.months.in.current.company))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.months.in.current.residence))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.times.30.DPD.or.worse.in.last.6.months))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.trades.opened.in.last.12.months))
## [1] 0
# [1] 1460
sum(is.na(rejected_records))
## [1] 1460
# [1] 1425
sum(is.na(rejected_records$Performance))
## [1] 1425
# Demographic Data contains 65 records with Age < 18 which is an invalid value
# 65
length(which(customer_master_data$Age < 18))
## [1] 65
# 20
length(which(customer_master_data$Age <= 0))
## [1] 20
# 65 records exists with Age < 18, with only 1 as defaulter
dplyr::select(customer_master_data[which(customer_master_data$Age < 18),], Age, Application.ID, Performance)
## Age Application.ID Performance
## 913 17 13167456 0
## 1111 17 15988053 1
## 6462 17 89770640 0
## 9315 16 130064793 0
## 9523 16 133256231 0
## 9634 16 134951209 0
## 10705 15 149528904 0
## 18401 15 256752828 0
## 22053 15 307736934 0
## 23024 17 321167182 0
## 23098 15 322139302 0
## 23371 0 325992471 0
## 23424 16 326621134 0
## 23512 17 327723491 0
## 23760 0 331033631 0
## 24313 17 339076142 0
## 26576 15 371030845 0
## 27205 0 380153306 0
## 27270 17 381342630 0
## 28787 17 403119279 0
## 29283 15 410153337 0
## 30880 17 431350709 0
## 31572 15 440605214 0
## 31606 0 441104387 0
## 33127 0 463622314 0
## 34569 17 483332934 0
## 35131 15 490791674 0
## 35289 0 492869740 0
## 36177 17 505345414 0
## 38231 16 534251279 0
## 38304 16 535285757 0
## 38974 16 544978628 0
## 39594 17 553384326 0
## 40638 17 568773679 0
## 41760 16 584758872 0
## 41815 0 585528536 0
## 42014 16 588417745 0
## 42155 0 590303560 0
## 43015 15 601969470 0
## 44439 17 622880517 0
## 45269 -3 634180637 0
## 47543 0 666398799 0
## 50066 15 701653071 0
## 50375 17 705529178 0
## 51200 16 717434244 0
## 53190 17 745462418 0
## 55160 17 773171568 0
## 55897 0 783195548 0
## 55957 17 783913343 0
## 56410 0 790296430 0
## 57503 17 805308826 0
## 57793 0 809393409 0
## 58025 17 812365997 0
## 60158 17 842745102 0
## 60649 0 848965588 0
## 60949 0 852875477 0
## 61664 0 862922520 0
## 64920 15 910506404 0
## 65339 0 915848904 0
## 65636 16 920083644 0
## 66567 0 932483616 0
## 68786 15 963813463 0
## 68833 0 964427213 0
## 69604 17 975085624 0
## 69820 0 977872820 0
# Removing records with Age < 18
customer_master_data <- customer_master_data[-which(customer_master_data$Age < 18),]
# [1] 69802
nrow(customer_master_data)
## [1] 69802
# Replacing missing values which are small in number, using simple and straight-forward techniques e.g. Mode etc
#
# No.of.trades.opened.in.last.6.months
# - Only 1 missing value exist
# - Make NA value as '0'
customer_master_data[which(is.na(customer_master_data$No.of.trades.opened.in.last.6.months)),
"No.of.trades.opened.in.last.6.months"] <- 0
# Verify values
length(which(is.na(customer_master_data$No.of.trades.opened.in.last.6.months)))
## [1] 0
# No.of.Dependents
# - Only 3 missing values exist
# - Make NA values for No.of.Dependents as '0'
customer_master_data [which(is.na(customer_master_data$No.of.dependents)),
"No.of.dependents"] <- 0
# Verify values
# [1] 0
length(which(is.na(customer_master_data$No.of.dependents)))
## [1] 0
# Gender
# - Only 2 missing values
# - Replacing with Mode value
summary(factor(customer_master_data$Gender))
## F M
## 1 16490 53311
customer_master_data[-which(customer_master_data$Gender %in% c("F","M")),
"Gender"] <- ModeFunc(customer_master_data$Gender)
# Marital Status
# Check for invalid i.e. NA values for Marital Status
nrow(customer_master_data[ -which(customer_master_data$Marital.Status..at.the.time.of.application.
%in%
c("Married","Single")),])
## [1] 5
# Imputing with Mode i.e. "Married"
customer_master_data[-which( customer_master_data$Marital.Status..at.the.time.of.application.
%in%
c("Married","Single")),
"Marital.Status..at.the.time.of.application."] <- ModeFunc(customer_master_data$Marital.Status..at.the.time.of.application.)
# Profession
# - Only 12 values are missing
# - Impute with Mode ()
summary(factor(customer_master_data$Profession))
## SAL SE SE_PROF
## 12 39639 13915 16236
# Imputing with Mode Value i.e. "SAL"
customer_master_data[-which(customer_master_data$Profession
%in%
c('SAL','SE','SE_PROF')),
"Profession"] <- ModeFunc(customer_master_data$Profession)
# Type of residence
# - 8 values are missing
# - Impute with Mode ()
summary(factor(customer_master_data$Type.of.residence))
## Company provided Living with Parents
## 8 1601 1767
## Others Owned Rented
## 198 13986 52242
# Imputing with Mode value i.e. "Rented"
customer_master_data[-which(customer_master_data$Type.of.residence
%in%
c('Company provided',
'Living with Parents',
'Others',
'Owned',
'Rented')),
"Type.of.residence"] <- ModeFunc(customer_master_data$Type.of.residence)
# Number of months in current residence
length(which(customer_master_data$No.of.months.in.current.residence <= 0))
## [1] 0
summary(customer_master_data$No.of.months.in.current.residence)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 6.00 10.00 34.57 61.00 126.00
# Number of months in current company
# imputing age less than or equal to 0
length(which(customer_master_data$No.of.months.in.current.company <= 0))
## [1] 0
summary(customer_master_data$No.of.months.in.current.company)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 17.00 34.00 34.19 51.00 133.00
# Education
# 118 records are with NA values - Need Imputation with WoE
nrow(customer_master_data[-which(customer_master_data$Education %in% c('Bachelor','Masters','Others','Phd','Professional')),])
## [1] 118
# Income
# 106 records are with NA values - Need Imputation with WoE
length(which(customer_master_data$Income <= 0))
## [1] 106
customer_master_data$Income_imputed <- customer_master_data$Income
customer_master_data[which(customer_master_data$Income_imputed <=0), "Income_imputed"] <- NA
rejected_records$Income_imputed <- rejected_records$Income
rejected_records[which(rejected_records$Income_imputed <=0), "Income_imputed"] <- NA
View(customer_master_data)
Note - Outliers removal is not required to perform on all measures, as it is not impacting any results Following are the variables with outliers.
Outstanding.Balance Income Avgas.CC.Utilization.in.last.12.months Total.No.of.Trades No.of.trades.opened.in.last.12.months No.of.Inquiries.in.last.12.months..excluding.home…auto.loans. No.of.PL.trades.opened.in.last.12.months
checkForOutliersDetection(customer_master_data, Outstanding.Balance) #function call
## Outliers identified: 0 n
# Outliers identified: 0
summary(customer_master_data$Outstanding.Balance)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 208494 774191 1253107 2925974 5218801 272
hist(customer_master_data$Outstanding.Balance, main = "Histogram of Outstanding.Balance")
# Income
# Outliers identified: 0
checkForOutliersDetection(customer_master_data, Income)
## Outliers identified: 0 n
hist(customer_master_data$Income, main = "Histogram of Income")
# Avgas.CC.Utilization.in.last.12.months
# Outliers identified: 3624
checkForOutliersDetection(customer_master_data, Avgas.CC.Utilization.in.last.12.months)
## Outliers identified: 3624 n
hist(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
main = "Histogram of Avgas.CC.Utilization.in.last.12.months")
outlier_range<-1.5*IQR(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
na.rm = T) #1843 outlier
upper_whisker=unname(quantile(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
0.95,
na.rm = T))+outlier_range
lower_whisker=unname(quantile(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
0.05,
na.rm = T))-outlier_range
customer_master_data2 <- customer_master_data[which(
(customer_master_data$Avgas.CC.Utilization.in.last.12.months>=upper_whisker |
customer_master_data$Avgas.CC.Utilization.in.last.12.months<=lower_whisker) ==FALSE),]
summary(customer_master_data$Performance)
## 0 1
## 66856 2946
summary(customer_master_data2$Performance)
## 0 1
## 65886 2898
# Total.No.of.Trades
# Outliers identified: 6818
checkForOutliersDetection(customer_master_data2, Total.No.of.Trades)
## Outliers identified: 6818 n
hist(customer_master_data2$Total.No.of.Trades, main = "Histogram of Total.No.of.Trades")
outlier_range<-1.5*IQR(customer_master_data2$Total.No.of.Trades, na.rm = T) #1843 outlier
upper_whisker=unname(quantile(customer_master_data2$Total.No.of.Trades,
0.95,
na.rm = T))+outlier_range
lower_whisker=unname(quantile(customer_master_data2$Total.No.of.Trades,
0.05,
na.rm = T))-outlier_range
customer_master_data3 <- customer_master_data2[which(
(customer_master_data2$Total.No.of.Trades>=upper_whisker |
customer_master_data2$Total.No.of.Trades<=lower_whisker)==FALSE),]
summary(customer_master_data3$Performance)
## 0 1
## 65779 2896
# Discarding the Outlier treatment as models are performing better with having Outliers
# It is not always required to remove outliers, as they carry on important patters & trends
#
# customer_master_data <- customer_master_data3
customer_master_data$Income_bin <- as.factor(cut(customer_master_data$Income,
breaks = c(-Inf,1,11,21,31,40,Inf),
labels=c("<0","1-10","11-20","21-30","31-40",">40"),
ordered = TRUE))
customer_master_data$age_bin <- as.factor(cut(customer_master_data$Age,
breaks=c(-Inf,35,46,55,Inf),
labels=c("<35","35-45","46-55",">55"), ordered = TRUE,
right = FALSE))
customer_master_data$current_residence_bin <- as.factor(cut(customer_master_data$No.of.months.in.current.residence,
breaks = c(-Inf,13,25,
37,49,61,
73,85,97,
109,121,Inf),
labels=c("<12","13-24",
"25-36","37-48",
"49-60","61-72",
"73-84","85-96",
"97-108","109-120",
">120"),
ordered = TRUE))
customer_master_data$current_company_bin <- as.factor(cut(customer_master_data$No.of.months.in.current.company,
breaks = c(1,12, 24, 36, 48,
60, 72,84,96,
108,120,136),
ordered = TRUE))
The data contains status of customer performance through variable Performance Tag with value 1 representing Default and 0 for Non-Default. We leverage R Information package for computing the Information Values (IV). But this package interprets 1 value for Good which is contradictory to business case here as Performance Tag value. So, we need another variable Performance Tag for IV with values 1 and 0 replaced with 0 and 1 respectively.
Reference table for Variable Importance Analysis based on Information Value(IV)
Information Value(IV) Predictive Power
<0.02 -> Useless for Prediction
0.02 - 0.1 -> Weak Predictor
0.1 - 0.3 -> Medium Predictor
0.3 - 0.5 -> Strong Predictor
>0.5 -> Suspecious
# Using WoE for both Variable Importance and also Missing Values
library(Information)
str(customer_master_data$Performance)
## Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
# Performance Tag for IV with values 1 and 0 replaced with 0 and 1 respectively
customer_master_data$Performance.Tag_forIV <- ifelse(customer_master_data$Performance == 0, 1, 0)
IV <- create_infotables(data=customer_master_data,
y="Performance.Tag_forIV",
bins = 10,
parallel = FALSE)
IV_Value = data.frame(IV$Summary)
# Printing values >=0.02
arrange(IV_Value [IV_Value$IV >=0.02, ], desc(IV))
## Variable
## 1 Avgas.CC.Utilization.in.last.12.months
## 2 No.of.trades.opened.in.last.12.months
## 3 No.of.PL.trades.opened.in.last.12.months
## 4 No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 5 Outstanding.Balance
## 6 No.of.times.30.DPD.or.worse.in.last.6.months
## 7 Total.No.of.Trades
## 8 No.of.PL.trades.opened.in.last.6.months
## 9 No.of.times.90.DPD.or.worse.in.last.12.months
## 10 No.of.times.60.DPD.or.worse.in.last.6.months
## 11 No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 12 No.of.times.30.DPD.or.worse.in.last.12.months
## 13 No.of.trades.opened.in.last.6.months
## 14 No.of.times.60.DPD.or.worse.in.last.12.months
## 15 No.of.times.90.DPD.or.worse.in.last.6.months
## 16 No.of.months.in.current.residence
## 17 current_residence_bin
## 18 Income_imputed
## 19 Income
## 20 Income_bin
## 21 No.of.months.in.current.company
## IV
## 1 0.31015860
## 2 0.29827712
## 3 0.29604052
## 4 0.29560438
## 5 0.24604217
## 6 0.24167952
## 7 0.23713984
## 8 0.21973098
## 9 0.21393775
## 10 0.20592613
## 11 0.20523987
## 12 0.19848248
## 13 0.18597050
## 14 0.18563797
## 15 0.16016368
## 16 0.07913990
## 17 0.06080075
## 18 0.04393392
## 19 0.04255551
## 20 0.04007508
## 21 0.02175181
# Two New columns are being added for every variable whose WoE analysis being done
# 1. Add WoE values for the variable as new feature (e.g. <variable>_WoE) and retain original feature/values in master data frame
# 2. For imputation to NA/Missing/Incorrect values BIN find other BIN/Bucket with nearest WoE value close enough,
# i.e. add new variable as <variable>_imputed
# 2a) For a continuous variable - when nearest match found, use median value of matching bucket
# 2b) For a continuous variable - when nearest match NOT found, use median for whole continuous variable
# 2c) For a categorical variable - when nearest match found, use Mode value of matching bucket
# 2d) For a categorical variable - when nearest match NOT found, use Mode value of whole categorical variable
#
# WoE Analysis for Education
Education_bin <- data.frame(IV$Tables$Education)
print(IV$Tables$Education)
## Education N Percent WOE IV
## 1 118 0.001690496 -0.004142828 2.906912e-08
## 2 Bachelor 17282 0.247586029 -0.016495934 6.791224e-05
## 3 Masters 23465 0.336165153 -0.008043166 8.973987e-05
## 4 Others 119 0.001704822 -0.492004075 6.089329e-04
## 5 Phd 4455 0.063823386 0.028258493 6.592444e-04
## 6 Professional 24363 0.349030114 0.017649959 7.671004e-04
plot_infotables(IV,"Education")
# Creating a column with WoE for Education
customer_master_data$Education_WoE <- ifelse(customer_master_data$Education=="Bachelor",
IV$Tables$Education$WOE[2],
ifelse(customer_master_data$Education=="Masters",
IV$Tables$Education$WOE[3],
ifelse(customer_master_data$Education=="Others",
IV$Tables$Education$WOE[4],
ifelse(customer_master_data$Education=="Phd",
IV$Tables$Education$WOE[5],
ifelse(customer_master_data$Education=="Professional",
IV$Tables$Education$WOE[6],
IV$Tables$Education$WOE[1])))))
unique(customer_master_data$Education_WoE)
## [1] 0.017649959 0.028258493 -0.008043166 -0.016495934 -0.004142828
## [6] -0.492004075
# Replace 'NA' (WoE = -0.004112913) values with Masters (nearest WoE = -0.008057761)
customer_master_data$Education_imputed <- ifelse(customer_master_data$Education == "",
"Masters",
customer_master_data$Education)
# WoE Analysis for Income
Income.bin <- data.frame(IV$Tables$Income_imputed)
print(Income.bin)
## Income_imputed N Percent WOE IV
## 1 NA 106 0.001518581 0.82915098 0.000726913
## 2 [1,5] 6222 0.089137847 -0.31412051 0.010901968
## 3 [6,10] 6508 0.093235151 -0.27545857 0.018939175
## 4 [11,16] 7916 0.113406493 -0.06639700 0.019454612
## 5 [17,21] 6795 0.097346781 -0.08141794 0.020124511
## 6 [22,26] 6821 0.097719263 -0.02551861 0.020188895
## 7 [27,31] 6807 0.097518696 -0.07956922 0.020829301
## 8 [32,36] 6820 0.097704937 0.15505200 0.023018462
## 9 [37,41] 6711 0.096143377 0.26716146 0.029100697
## 10 [42,48] 7780 0.111458124 0.17694798 0.032321235
## 11 [49,60] 7316 0.104810750 0.36098054 0.043933921
# # Replace '[-0.5,5]' (WoE = 0.30218631) values with median of [6,10] (WoE = -0.27545857)
# customer_master_data$Income_imputed <- ifelse(is.na(customer_master_data$Income_imputed),
# median(customer_master_data$Income_imputed, na.rm = TRUE),
# customer_master_data$Income_imputed)
# creating a Woe column for Income for master dataframe
customer_master_data$Income_imputed_WoE <- ifelse(is.na(customer_master_data$Income_imputed),
IV$Tables$Income_imputed$WOE[1],
ifelse(between(customer_master_data$Income_imputed,1,5),
IV$Tables$Income_imputed$WOE[2],
ifelse(between(customer_master_data$Income_imputed,6,10),
IV$Tables$Income_imputed$WOE[3],
ifelse(between(customer_master_data$Income_imputed,11,16),
IV$Tables$Income_imputed$WOE[4],
ifelse(between(customer_master_data$Income_imputed,17,21),
IV$Tables$Income_imputed$WOE[5],
ifelse(between(customer_master_data$Income_imputed,22,26),
IV$Tables$Income_imputed$WOE[6],
ifelse(between(customer_master_data$Income_imputed,27,31),
IV$Tables$Income_imputed$WOE[7],
ifelse(between(customer_master_data$Income_imputed,32,36),
IV$Tables$Income_imputed$WOE[8],
ifelse(between(customer_master_data$Income_imputed,37,41),
IV$Tables$Income_imputed$WOE[9],
ifelse(between(customer_master_data$Income_imputed,42,48),
IV$Tables$Income_imputed$WOE[10],
IV$Tables$Income_imputed$WOE[11]))))))))))
unique(customer_master_data$Income_imputed_WoE)
## [1] 0.15505200 -0.06639700 -0.07956922 0.36098054 0.17694798
## [6] -0.31412051 0.26716146 -0.27545857 -0.08141794 -0.02551861
## [11] 0.82915098
# [1] 0.15505200 -0.06639700 -0.07956922 0.36098054 0.17694798 -0.31412051 0.26716146 -0.27545857 -0.08141794 -0.02551861 0.82915098
# creating a Woe column for Income for rejected records
rejected_records$Income_imputed_WoE <- ifelse(is.na(rejected_records$Income_imputed),
IV$Tables$Income_imputed$WOE[1],
ifelse(between(rejected_records$Income_imputed,1,5),
IV$Tables$Income_imputed$WOE[2],
ifelse(between(rejected_records$Income_imputed,6,10),
IV$Tables$Income_imputed$WOE[3],
ifelse(between(rejected_records$Income_imputed,11,16),
IV$Tables$Income_imputed$WOE[4],
ifelse(between(rejected_records$Income_imputed,17,21),
IV$Tables$Income_imputed$WOE[5],
ifelse(between(rejected_records$Income_imputed,22,26),
IV$Tables$Income_imputed$WOE[6],
ifelse(between(rejected_records$Income_imputed,27,31),
IV$Tables$Income_imputed$WOE[7],
ifelse(between(rejected_records$Income_imputed,32,36),
IV$Tables$Income_imputed$WOE[8],
ifelse(between(rejected_records$Income_imputed,37,41),
IV$Tables$Income_imputed$WOE[9],
ifelse(between(rejected_records$Income_imputed,42,48),
IV$Tables$Income_imputed$WOE[10],
IV$Tables$Income_imputed$WOE[11]))))))))))
unique(rejected_records$Income_imputed_WoE)
## [1] -0.27545857 -0.02551861 -0.07956922 -0.08141794 -0.31412051
## [6] 0.26716146 0.36098054 -0.06639700 0.15505200 0.17694798
# Replace NA with median for master dataframe
customer_master_data[which(is.na(customer_master_data$Income_imputed)),
"Income_imputed"] <- median(customer_master_data$Income_imputed,
na.rm = TRUE)
# Replace NA with median for rejected records dataframe
rejected_records$Income_imputed <- rejected_records$Income
rejected_records[which(is.na(rejected_records$Income_imputed)),
"Income_imputed"] <- median(rejected_records$Income_imputed,
na.rm = TRUE)
plot_infotables(IV,"Income_imputed")
# WoE Analysis for Presence.of.open.home.loan
# Creating a WoE column for Presence.of.open.home.loan
Presence.of.open.home.loan.bin <- data.frame(IV$Tables$Presence.of.open.home.loan)
print(IV$Tables$Presence.of.open.home.loan)
## Presence.of.open.home.loan N Percent WOE IV
## 1 NA 272 0.003896736 0.37441483 0.0004617429
## 2 [0,0] 51487 0.737614968 -0.07384043 0.0046222731
## 3 [1,1] 18043 0.258488295 0.23737694 0.0177045058
plot_infotables(IV,"Presence.of.open.home.loan")
customer_master_data$Presence.of.open.home.loan_WoE <- ifelse(is.na(customer_master_data$Presence.of.open.home.loan),
IV$Tables$Presence.of.open.home.loan$WOE[1],
ifelse(customer_master_data$Presence.of.open.home.loan==1,
IV$Tables$Presence.of.open.home.loan$WOE[3],
IV$Tables$Presence.of.open.home.loan$WOE[2]))
# Replace 'NA' (WoE = 0.37444474) values with 1 (nearest WoE = 0.23740686)
customer_master_data$Presence.of.open.home.loan_imputed <- ifelse(is.na(customer_master_data$Presence.of.open.home.loan),
1,
customer_master_data$Presence.of.open.home.loan)
# WoE Analysis for Outstanding.Balance
# Creating a WoE column for Outstanding.Balance
Outstanding.Balance.bin <- data.frame(IV$Tables$Outstanding.Balance)
print(IV$Tables$Outstanding.Balance)
## Outstanding.Balance N Percent WOE IV
## 1 NA 272 0.003896736 0.3744148 0.0004617429
## 2 [0,6851] 6952 0.099596000 0.7700212 0.0425995861
## 3 [6852,25590] 6953 0.099610326 0.9199346 0.0991493847
## 4 [25600,386878] 6953 0.099610326 0.1340639 0.1008337943
## 5 [386879,585389] 6953 0.099610326 -0.2547105 0.1081048290
## 6 [585402,774181] 6953 0.099610326 -0.4513214 0.1331444733
## 7 [774188,972265] 6953 0.099610326 -0.4369762 0.1564590444
## 8 [972273,1357072] 6953 0.099610326 -0.4027883 0.1759510634
## 9 [1357076,2960907] 6953 0.099610326 0.3819991 0.1881965294
## 10 [2960909,3282409] 6953 0.099610326 0.8306179 0.2360178035
## 11 [3282457,5218801] 6954 0.099624653 -0.2961643 0.2460421728
plot_infotables(IV,"Outstanding.Balance")
customer_master_data$Outstanding.Balance_WoE <- ifelse(is.na(customer_master_data$Outstanding.Balance),
IV$Tables$Outstanding.Balance$WOE[1],
ifelse(customer_master_data$Outstanding.Balance <=6851,
IV$Tables$Outstanding.Balance$WOE[2],
ifelse(between(customer_master_data$Outstanding.Balance,6852,25590),
IV$Tables$Outstanding.Balance$WOE[3],
ifelse(between(customer_master_data$Outstanding.Balance,25600,386878),
IV$Tables$Outstanding.Balance$WOE[4],
ifelse(between(customer_master_data$Outstanding.Balance,386879,585389),
IV$Tables$Outstanding.Balance$WOE[5],
ifelse(between(customer_master_data$Outstanding.Balance,585402,774181),
IV$Tables$Outstanding.Balance$WOE[6],
ifelse(between(customer_master_data$Outstanding.Balance,774188,972265),
IV$Tables$Outstanding.Balance$WOE[7],
ifelse(between(customer_master_data$Outstanding.Balance,972273,1357072),
IV$Tables$Outstanding.Balance$WOE[8],
ifelse(between(customer_master_data$Outstanding.Balance,1357076,2960907),
IV$Tables$Outstanding.Balance$WOE[9],
ifelse(between(customer_master_data$Outstanding.Balance,2960909,3282409),
IV$Tables$Outstanding.Balance$WOE[10],
IV$Tables$Outstanding.Balance$WOE[11]))))))))))
unique(customer_master_data$Outstanding.Balance_WoE)
## [1] -0.2961643 -0.4513214 -0.4369762 0.1340639 -0.4027883 0.7700212
## [7] -0.2547105 0.8306179 0.3819991 0.9199346 0.3744148
# Calculate for rejected population
rejected_records$Outstanding.Balance_WoE <- ifelse(is.na(rejected_records$Outstanding.Balance),
IV$Tables$Outstanding.Balance$WOE[1],
ifelse(rejected_records$Outstanding.Balance <=6851,
IV$Tables$Outstanding.Balance$WOE[2],
ifelse(between(rejected_records$Outstanding.Balance,6852,25590),
IV$Tables$Outstanding.Balance$WOE[3],
ifelse(between(rejected_records$Outstanding.Balance,25600,386878),
IV$Tables$Outstanding.Balance$WOE[4],
ifelse(between(rejected_records$Outstanding.Balance,386879,585389),
IV$Tables$Outstanding.Balance$WOE[5],
ifelse(between(rejected_records$Outstanding.Balance,585402,774181),
IV$Tables$Outstanding.Balance$WOE[6],
ifelse(between(rejected_records$Outstanding.Balance,774188,972265),
IV$Tables$Outstanding.Balance$WOE[7],
ifelse(between(rejected_records$Outstanding.Balance,972273,1357072),
IV$Tables$Outstanding.Balance$WOE[8],
ifelse(between(rejected_records$Outstanding.Balance,1357076,2960907),
IV$Tables$Outstanding.Balance$WOE[9],
ifelse(between(rejected_records$Outstanding.Balance,2960909,3282409),
IV$Tables$Outstanding.Balance$WOE[10],
IV$Tables$Outstanding.Balance$WOE[11]))))))))))
unique(rejected_records$Outstanding.Balance_WoE)
## [1] -0.4027883 -0.4369762 -0.4513214 0.1340639 -0.2547105 -0.2961643
## [7] 0.8306179 0.3819991 0.9199346
# Replace 'NA' (WoE = 0.3744447) values with median of [1357118,2960907] (WoE = 0.3818808)
customer_master_data$Outstanding.Balance_imputed <- ifelse(is.na(customer_master_data$Outstanding.Balance),
median(filter(customer_master_data, Outstanding.Balance >=1357076 & Outstanding.Balance<=2960907) [, "Outstanding.Balance"]),
customer_master_data$Outstanding.Balance)
# No 'NA' values for Outstanding.Balance in Rejected population
sum(is.na(rejected_records$Outstanding.Balance))
## [1] 0
rejected_records$Outstanding.Balance_imputed <- rejected_records$Outstanding.Balance
# WoE Analysis for Avgas.CC.Utilization.in.last.12.months
print(IV$Tables$Avgas.CC.Utilization.in.last.12.months)
## Avgas.CC.Utilization.in.last.12.months N Percent WOE
## 1 NA 1018 0.01458411 -0.11599767
## 2 [0,4] 5521 0.07909515 0.80182190
## 3 [5,6] 5463 0.07826423 0.80062757
## 4 [7,8] 6856 0.09822068 0.79320847
## 5 [9,11] 9587 0.13734563 0.67681490
## 6 [12,14] 6585 0.09433827 0.46706002
## 7 [15,21] 6851 0.09814905 0.07916613
## 8 [22,37] 7116 0.10194550 -0.47533086
## 9 [38,51] 6742 0.09658749 -0.58461156
## 10 [52,71] 7016 0.10051288 -0.56326710
## 11 [72,113] 7047 0.10095699 -0.38102610
## IV
## 1 0.000206996
## 2 0.036016027
## 3 0.071360890
## 4 0.115034194
## 5 0.161716538
## 6 0.178421559
## 7 0.179014873
## 8 0.207765296
## 9 0.251161250
## 10 0.292660548
## 11 0.310158603
plot_infotables(IV,"Avgas.CC.Utilization.in.last.12.months")
customer_master_data$Avgas.CC.Utilization.in.last.12.months_WoE <- ifelse(is.na(customer_master_data$Avgas.CC.Utilization.in.last.12.months),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[1],
ifelse(customer_master_data$Avgas.CC.Utilization.in.last.12.months<=4,
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[2],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,5,6),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[3],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,7,8),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[4],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,9,11),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[5],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,12,14),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[6],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,15,21),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[7],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,22,37),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[8],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,38,51),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[9],
ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,52,71),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[10],
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[11]))))))))))
unique(customer_master_data$Avgas.CC.Utilization.in.last.12.months_WoE)
## [1] -0.38102610 0.67681490 -0.47533086 0.07916613 0.80182190
## [6] 0.80062757 -0.58461156 0.46706002 -0.56326710 -0.11599767
## [11] 0.79320847
# Calculate for rejected population
rejected_records$Avgas.CC.Utilization.in.last.12.months_WoE <- ifelse(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[1],
ifelse(rejected_records$Avgas.CC.Utilization.in.last.12.months<=4,
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[2],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,5,6),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[3],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,7,8),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[4],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,9,11),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[5],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,12,14),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[6],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,15,21),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[7],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,22,37),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[8],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,38,51),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[9],
ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,52,71),
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[10],
IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[11]))))))))))
unique(rejected_records$Avgas.CC.Utilization.in.last.12.months_WoE)
## [1] -0.56326710 -0.38102610 -0.58461156 -0.47533086 0.07916613
## [6] 0.46706002 -0.11599767 0.67681490 0.80062757 0.79320847
## [11] 0.80182190
# Replace 'NA' (WoE = -0.1159976) values with median of [72,113] (nearest WoE = -0.38102610)
customer_master_data$Avgas.CC.Utilization.in.last.12.months_imputed <- ifelse(is.na(customer_master_data$Avgas.CC.Utilization.in.last.12.months),
median(filter(customer_master_data, Avgas.CC.Utilization.in.last.12.months >=72 & Avgas.CC.Utilization.in.last.12.months<=113) [, "Avgas.CC.Utilization.in.last.12.months"]),
customer_master_data$Avgas.CC.Utilization.in.last.12.months)
# NA values exist for Avgas.CC.Utilization.in.last.12.months in rejected population
sum(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months))
## [1] 35
# [1] 35
rejected_records$Avgas.CC.Utilization.in.last.12.months_imputed <- ifelse(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months),
median(filter(rejected_records, Avgas.CC.Utilization.in.last.12.months >=72 & Avgas.CC.Utilization.in.last.12.months<=113) [, "Avgas.CC.Utilization.in.last.12.months"]),
rejected_records$Avgas.CC.Utilization.in.last.12.months)
sum(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months_imputed))
## [1] 0
# [1] 0
summary(customer_master_data)
## Application.ID Age Gender
## Min. :1.004e+05 Min. :18.00 Length:69802
## 1st Qu.:2.484e+08 1st Qu.:38.00 Class :character
## Median :4.979e+08 Median :45.00 Mode :character
## Mean :4.992e+08 Mean :45.03
## 3rd Qu.:7.498e+08 3rd Qu.:53.00
## Max. :1.000e+09 Max. :65.00
##
## Marital.Status..at.the.time.of.application. No.of.dependents
## Length:69802 Min. :0.00
## Class :character 1st Qu.:2.00
## Mode :character Median :3.00
## Mean :2.86
## 3rd Qu.:4.00
## Max. :5.00
##
## Income Education Profession Type.of.residence
## Min. :-0.50 Length:69802 Length:69802 Length:69802
## 1st Qu.:14.00 Class :character Class :character Class :character
## Median :27.00 Mode :character Mode :character Mode :character
## Mean :27.41
## 3rd Qu.:40.00
## Max. :60.00
##
## No.of.months.in.current.residence No.of.months.in.current.company
## Min. : 6.00 Min. : 3.00
## 1st Qu.: 6.00 1st Qu.: 17.00
## Median : 10.00 Median : 34.00
## Mean : 34.57 Mean : 34.19
## 3rd Qu.: 61.00 3rd Qu.: 51.00
## Max. :126.00 Max. :133.00
##
## Performance No.of.times.90.DPD.or.worse.in.last.6.months
## 0:66856 Min. :0.0000
## 1: 2946 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2491
## 3rd Qu.:0.0000
## Max. :3.0000
##
## No.of.times.60.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3918
## 3rd Qu.:1.0000
## Max. :5.0000
##
## No.of.times.30.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.5236
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.90.DPD.or.worse.in.last.12.months
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.415
## 3rd Qu.:1.000
## Max. :5.000
##
## No.of.times.60.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.6035
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.30.DPD.or.worse.in.last.12.months
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.734
## 3rd Qu.:1.000
## Max. :9.000
##
## Avgas.CC.Utilization.in.last.12.months
## Min. : 0.00
## 1st Qu.: 8.00
## Median : 15.00
## Mean : 29.27
## 3rd Qu.: 45.00
## Max. :113.00
## NA's :1018
## No.of.trades.opened.in.last.6.months
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 2.286
## 3rd Qu.: 3.000
## Max. :12.000
##
## No.of.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 4.000
## Mean : 5.788
## 3rd Qu.: 9.000
## Max. :28.000
##
## No.of.PL.trades.opened.in.last.6.months
## Min. :0.00
## 1st Qu.:0.00
## Median :1.00
## Mean :1.19
## 3rd Qu.:2.00
## Max. :6.00
##
## No.of.PL.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 2.000
## Mean : 2.365
## 3rd Qu.: 4.000
## Max. :12.000
##
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 1.759
## 3rd Qu.: 3.000
## Max. :10.000
##
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 3.000
## Mean : 3.527
## 3rd Qu.: 5.000
## Max. :20.000
##
## Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## Min. :0.0000 Min. : 0 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 208494 1st Qu.: 3.000
## Median :0.0000 Median : 774191 Median : 6.000
## Mean :0.2595 Mean :1253107 Mean : 8.178
## 3rd Qu.:1.0000 3rd Qu.:2925974 3rd Qu.:10.000
## Max. :1.0000 Max. :5218801 Max. :44.000
## NA's :272 NA's :272
## Presence.of.open.auto.loan Income_imputed Income_bin age_bin
## Min. :0.00000 Min. : 1.00 <0 : 124 <35 :11361
## 1st Qu.:0.00000 1st Qu.:14.00 1-10 :14031 35-45:24740
## Median :0.00000 Median :27.00 11-20:13392 46-55:20446
## Mean :0.08487 Mean :27.45 21-30:13628 >55 :13255
## 3rd Qu.:0.00000 3rd Qu.:40.00 31-40:12179
## Max. :1.00000 Max. :60.00 >40 :16448
##
## current_residence_bin current_company_bin Performance.Tag_forIV
## <12 :36224 (1,12] :13480 Min. :0.0000
## 13-24 : 4368 (36,48]:12406 1st Qu.:1.0000
## 25-36 : 4157 (24,36]:12102 Median :1.0000
## 37-48 : 4060 (48,60]:12084 Mean :0.9578
## 49-60 : 3801 (12,24]:11864 3rd Qu.:1.0000
## 61-72 : 3463 (60,72]: 6544 Max. :1.0000
## (Other):13729 (Other): 1322
## Education_WoE Education_imputed Income_imputed_WoE
## Min. :-0.4920041 Length:69802 Min. :-0.31412
## 1st Qu.:-0.0080432 Class :character 1st Qu.:-0.08142
## Median :-0.0080432 Mode :character Median :-0.02552
## Mean : 0.0003301 Mean : 0.02026
## 3rd Qu.: 0.0176500 3rd Qu.: 0.17695
## Max. : 0.0282585 Max. : 0.82915
##
## Presence.of.open.home.loan_WoE Presence.of.open.home.loan_imputed
## Min. :-0.073840 Min. :0.0000
## 1st Qu.:-0.073840 1st Qu.:0.0000
## Median :-0.073840 Median :0.0000
## Mean : 0.008352 Mean :0.2624
## 3rd Qu.: 0.237377 3rd Qu.:1.0000
## Max. : 0.374415 Max. :1.0000
##
## Outstanding.Balance_WoE Outstanding.Balance_imputed
## Min. :-0.4513 Min. : 0
## 1st Qu.:-0.4028 1st Qu.: 209114
## Median : 0.1341 Median : 775596
## Mean : 0.1204 Mean :1259625
## 3rd Qu.: 0.7700 3rd Qu.:2925865
## Max. : 0.9199 Max. :5218801
##
## Avgas.CC.Utilization.in.last.12.months_WoE
## Min. :-0.58461
## 1st Qu.:-0.47533
## Median : 0.07917
## Mean : 0.14708
## 3rd Qu.: 0.79321
## Max. : 0.80182
##
## Avgas.CC.Utilization.in.last.12.months_imputed
## Min. : 0.00
## 1st Qu.: 8.00
## Median : 15.00
## Mean : 30.35
## 3rd Qu.: 46.00
## Max. :113.00
##
summary(rejected_records)
## Application.ID Age Gender
## Min. : 207075 Min. :22.00 Length:1425
## 1st Qu.:232979991 1st Qu.:34.00 Class :character
## Median :476559413 Median :41.00 Mode :character
## Mean :485902049 Mean :42.37
## 3rd Qu.:733733436 3rd Qu.:50.00
## Max. :997504566 Max. :65.00
##
## Marital.Status..at.the.time.of.application. No.of.dependents
## Length:1425 Min. :1.000
## Class :character 1st Qu.:3.000
## Mode :character Median :3.000
## Mean :3.141
## 3rd Qu.:4.000
## Max. :5.000
##
## Income Education Profession Type.of.residence
## Min. : 4.50 Length:1425 Length:1425 Length:1425
## 1st Qu.: 5.00 Class :character Class :character Class :character
## Median :11.00 Mode :character Mode :character Mode :character
## Mean :16.74
## 3rd Qu.:24.00
## Max. :60.00
##
## No.of.months.in.current.residence No.of.months.in.current.company
## Min. : 6.00 Min. : 3.00
## 1st Qu.: 6.00 1st Qu.: 6.00
## Median : 19.00 Median :15.00
## Mean : 32.37 Mean :22.14
## 3rd Qu.: 47.00 3rd Qu.:34.00
## Max. :126.00 Max. :75.00
##
## Performance No.of.times.90.DPD.or.worse.in.last.6.months
## 0 : 0 Min. :0.000
## 1 : 0 1st Qu.:1.000
## NA's:1425 Median :1.000
## Mean :1.316
## 3rd Qu.:2.000
## Max. :3.000
##
## No.of.times.60.DPD.or.worse.in.last.6.months
## Min. :0.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.335
## 3rd Qu.:3.000
## Max. :5.000
##
## No.of.times.30.DPD.or.worse.in.last.6.months
## Min. :0.00
## 1st Qu.:2.00
## Median :3.00
## Mean :3.21
## 3rd Qu.:4.00
## Max. :7.00
##
## No.of.times.90.DPD.or.worse.in.last.12.months
## Min. :0.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.191
## 3rd Qu.:3.000
## Max. :5.000
##
## No.of.times.60.DPD.or.worse.in.last.12.months
## Min. :0.000
## 1st Qu.:2.000
## Median :3.000
## Mean :3.209
## 3rd Qu.:4.000
## Max. :7.000
##
## No.of.times.30.DPD.or.worse.in.last.12.months
## Min. :0.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.086
## 3rd Qu.:5.000
## Max. :9.000
##
## Avgas.CC.Utilization.in.last.12.months
## Min. : 1.00
## 1st Qu.: 35.00
## Median : 51.00
## Mean : 51.08
## 3rd Qu.: 67.00
## Max. :101.00
## NA's :35
## No.of.trades.opened.in.last.6.months
## Min. :0.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.933
## 3rd Qu.:4.000
## Max. :6.000
##
## No.of.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 6.000
## Median : 8.000
## Mean : 7.884
## 3rd Qu.:10.000
## Max. :14.000
##
## No.of.PL.trades.opened.in.last.6.months
## Min. :0.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.057
## 3rd Qu.:3.000
## Max. :4.000
##
## No.of.PL.trades.opened.in.last.12.months
## Min. :0.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.064
## 3rd Qu.:5.000
## Max. :8.000
##
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## Min. :0.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.036
## 3rd Qu.:3.000
## Max. :4.000
##
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## Min. :0.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.036
## 3rd Qu.:5.000
## Max. :8.000
##
## Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## Min. :0.00000 Min. : 17155 Min. : 1.000
## 1st Qu.:0.00000 1st Qu.: 610175 1st Qu.: 7.000
## Median :0.00000 Median : 804672 Median : 9.000
## Mean :0.09895 Mean :1045252 Mean : 8.772
## 3rd Qu.:0.00000 3rd Qu.:1074624 3rd Qu.:11.000
## Max. :1.00000 Max. :4143938 Max. :16.000
##
## Presence.of.open.auto.loan Income_imputed Income_imputed_WoE
## Min. :0.00000 Min. : 4.50 Min. :-0.31412
## 1st Qu.:0.00000 1st Qu.: 5.00 1st Qu.:-0.31412
## Median :0.00000 Median :11.00 Median :-0.08142
## Mean :0.07228 Mean :16.74 Mean :-0.12574
## 3rd Qu.:0.00000 3rd Qu.:24.00 3rd Qu.:-0.06640
## Max. :1.00000 Max. :60.00 Max. : 0.36098
##
## Outstanding.Balance_WoE Outstanding.Balance_imputed
## Min. :-0.4513 Min. : 17155
## 1st Qu.:-0.4370 1st Qu.: 610175
## Median :-0.4028 Median : 804672
## Mean :-0.3014 Mean :1045252
## 3rd Qu.:-0.2547 3rd Qu.:1074624
## Max. : 0.9199 Max. :4143938
##
## Avgas.CC.Utilization.in.last.12.months_WoE
## Min. :-0.5846
## 1st Qu.:-0.5633
## Median :-0.5633
## Mean :-0.4205
## 3rd Qu.:-0.3810
## Max. : 0.8018
##
## Avgas.CC.Utilization.in.last.12.months_imputed
## Min. : 1.00
## 1st Qu.: 36.00
## Median : 52.00
## Mean : 51.81
## 3rd Qu.: 68.00
## Max. :101.00
##
# Using No.of.dependents as numerical only
# customer_master_data$No.of.dependents <- as.factor(customer_master_data$No.of.dependents)
# customer_master_data$No.of.dependents <- as.numeric(customer_master_data$No.of.dependents)
# Gender
customer_master_data$Gender <- as.factor(customer_master_data$Gender)
levels(customer_master_data$Gender) <- c(1,0)
# Marital Status
customer_master_data$Marital.Status..at.the.time.of.application. <- as.factor(customer_master_data$Marital.Status..at.the.time.of.application.)
levels(customer_master_data$Marital.Status..at.the.time.of.application.) <- c(1,0)
# Type of Residence
customer_master_data$Type.of.residence <- as.factor(customer_master_data$Type.of.residence)
# One-Hot encoding for Education_imputed
customer_master_data$Education_imputed <- as.factor(customer_master_data$Education_imputed)
dummy_education <- data.frame(model.matrix(~Education_imputed,
data=customer_master_data))
dummy_education <- dummy_education[,-1]
customer_master_data <- cbind(customer_master_data, dummy_education)
# One-Hot encoding for Profession
customer_master_data$Profession <- as.factor(customer_master_data$Profession)
dummy_profession <- data.frame(model.matrix(~Profession,
data=customer_master_data))
dummy_profession <- dummy_profession[,-1]
customer_master_data <- cbind(customer_master_data, dummy_profession)
# One-Hot encoding for Residence Type
customer_master_data$Type.of.residence <- as.factor(customer_master_data$Type.of.residence)
dummy_residencetype <- data.frame(model.matrix(~Type.of.residence,
data=customer_master_data))
dummy_residencetype <- dummy_residencetype[,-1]
customer_master_data <- cbind(customer_master_data, dummy_residencetype)
# Creating a .CSV file with WoE values
write.csv(customer_master_data,"customer_master_data_cleaned_WoE_feature_engineering.csv")
graph_data <- customer_master_data
# Columns considered to convert into factor
toFactor_colname <- c("Gender","Marital.Status..at.the.time.of.application.",
"No.of.dependents","Education","Profession",
"Income_bin","age_bin","current_residence_bin",
"current_company_bin","Type.of.residence",
"Presence.of.open.auto.loan","Presence.of.open.home.loan",
"No.of.times.30.DPD.or.worse.in.last.12.months",
"No.of.times.30.DPD.or.worse.in.last.6.months",
"No.of.times.60.DPD.or.worse.in.last.12.months",
"No.of.times.60.DPD.or.worse.in.last.6.months",
"No.of.times.90.DPD.or.worse.in.last.12.months",
"No.of.times.90.DPD.or.worse.in.last.6.months")
graph_data[toFactor_colname] <- lapply(graph_data[toFactor_colname],factor)
graph_data$Performance <- as.factor(ifelse(graph_data$Performance==0,
"Non-Defaulters",
"Defaulters"))
ggplot(graph_data,aes(x=Performance,fill=Performance)) +
geom_bar() +
geom_text(stat = "count", aes(y = ((..count..)/sum(..count..)),
label = scales::percent((..count..)/sum(..count..))),
vjust =-0.25)
# Only 4.2% defaulters and this is an unbalanced data
str(graph_data)
## 'data.frame': 69802 obs. of 54 variables:
## $ Application.ID : int 100450 128993 142768 176721 197956 203973 210394 223052 237197 247959 ...
## $ Age : int 52 36 55 55 28 43 42 51 44 59 ...
## $ Gender : Factor w/ 2 levels "1","0": 2 2 2 2 1 1 2 2 2 2 ...
## $ Marital.Status..at.the.time.of.application. : Factor w/ 2 levels "1","0": 1 1 1 1 1 2 1 1 1 1 ...
## $ No.of.dependents : Factor w/ 6 levels "0","1","2","3",..: 5 5 2 4 4 2 3 4 5 6 ...
## $ Income : num 32 13 29 53 35 35 43 4.5 5 40 ...
## $ Education : Factor w/ 6 levels "","Bachelor",..: 6 6 5 6 3 6 2 3 6 3 ...
## $ Profession : Factor w/ 3 levels "SAL","SE","SE_PROF": 3 1 1 3 1 1 3 1 1 1 ...
## $ Type.of.residence : Factor w/ 5 levels "Company provided",..: 5 5 5 5 5 5 5 5 5 4 ...
## $ No.of.months.in.current.residence : int 79 6 46 6 6 6 6 83 6 6 ...
## $ No.of.months.in.current.company : int 3 21 3 27 43 52 3 48 38 5 ...
## $ Performance : Factor w/ 2 levels "Defaulters","Non-Defaulters": 2 1 2 2 2 2 2 2 2 2 ...
## $ No.of.times.90.DPD.or.worse.in.last.6.months : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 1 1 2 1 1 ...
## $ No.of.times.60.DPD.or.worse.in.last.6.months : Factor w/ 6 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
## $ No.of.times.30.DPD.or.worse.in.last.6.months : Factor w/ 8 levels "0","1","2","3",..: 1 3 2 1 1 1 1 2 1 1 ...
## $ No.of.times.90.DPD.or.worse.in.last.12.months : Factor w/ 6 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
## $ No.of.times.60.DPD.or.worse.in.last.12.months : Factor w/ 8 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
## $ No.of.times.30.DPD.or.worse.in.last.12.months : Factor w/ 10 levels "0","1","2","3",..: 1 3 2 1 1 1 1 2 1 1 ...
## $ Avgas.CC.Utilization.in.last.12.months : int 113 9 34 18 3 6 0 48 13 16 ...
## $ No.of.trades.opened.in.last.6.months : num 2 1 2 0 7 1 1 2 1 2 ...
## $ No.of.trades.opened.in.last.12.months : int 8 6 7 1 13 2 2 5 3 4 ...
## $ No.of.PL.trades.opened.in.last.6.months : int 2 1 2 0 5 0 0 2 0 1 ...
## $ No.of.PL.trades.opened.in.last.12.months : int 5 4 4 1 5 0 1 3 1 1 ...
## $ No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. : int 1 4 1 2 6 0 1 2 2 1 ...
## $ No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.: int 3 7 2 3 11 0 3 5 4 1 ...
## $ Presence.of.open.home.loan : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ Outstanding.Balance : int 3903438 741058 815325 209593 992024 556 202816 575772 204444 199818 ...
## $ Total.No.of.Trades : int 9 8 9 3 25 5 4 6 5 7 ...
## $ Presence.of.open.auto.loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Income_imputed : num 32 13 29 53 35 35 43 4.5 5 40 ...
## $ Income_bin : Ord.factor w/ 6 levels "<0"<"1-10"<"11-20"<..: 5 3 4 6 5 5 6 2 2 5 ...
## $ age_bin : Ord.factor w/ 4 levels "<35"<"35-45"<..: 3 2 4 4 1 2 2 3 2 4 ...
## $ current_residence_bin : Ord.factor w/ 11 levels "<12"<"13-24"<..: 7 1 4 1 1 1 1 7 1 1 ...
## $ current_company_bin : Ord.factor w/ 10 levels "(1,12]"<"(12,24]"<..: 1 2 1 3 4 5 1 4 4 1 ...
## $ Performance.Tag_forIV : num 1 0 1 1 1 1 1 1 1 1 ...
## $ Education_WoE : num 0.01765 0.01765 0.02826 0.01765 -0.00804 ...
## $ Education_imputed : Factor w/ 5 levels "Bachelor","Masters",..: 5 5 4 5 2 5 1 2 5 2 ...
## $ Income_imputed_WoE : num 0.1551 -0.0664 -0.0796 0.361 0.1551 ...
## $ Presence.of.open.home.loan_WoE : num 0.2374 -0.0738 -0.0738 -0.0738 -0.0738 ...
## $ Presence.of.open.home.loan_imputed : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Outstanding.Balance_WoE : num -0.296 -0.451 -0.437 0.134 -0.403 ...
## $ Outstanding.Balance_imputed : int 3903438 741058 815325 209593 992024 556 202816 575772 204444 199818 ...
## $ Avgas.CC.Utilization.in.last.12.months_WoE : num -0.381 0.6768 -0.4753 0.0792 0.8018 ...
## $ Avgas.CC.Utilization.in.last.12.months_imputed : int 113 9 34 18 3 6 0 48 13 16 ...
## $ Education_imputedMasters : num 0 0 0 0 1 0 0 1 0 1 ...
## $ Education_imputedOthers : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education_imputedPhd : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Education_imputedProfessional : num 1 1 0 1 0 1 0 0 1 0 ...
## $ ProfessionSE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ProfessionSE_PROF : num 1 0 0 1 0 0 1 0 0 0 ...
## $ Type.of.residenceLiving.with.Parents : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Type.of.residenceOthers : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Type.of.residenceOwned : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Type.of.residenceRented : num 1 1 1 1 1 1 1 1 1 0 ...
graph_data_categorical <- graph_data[,sapply(graph_data,is.factor)]
graph_data_continuous <- graph_data[,!sapply(graph_data,is.factor)]
graph_data_continuous <- graph_data_continuous[,-c(1,2)]
graph_data_categorical <- graph_data_categorical %>% dplyr::select(-Performance,Performance)
View(graph_data_categorical)
View(graph_data_continuous)
gather(graph_data_categorical, x, y, Gender:current_company_bin) %>%
ggplot(aes(x = y, color = Performance, fill = Performance)) +
geom_density(alpha = 0.3) +
facet_wrap( ~ x, scales = "free", ncol = 3)
# Histograms
# Excluding variables of Type - Original with NA/Missing Values, Dummy Variables & WoE Values
graph_data_continuous [, -c(1, 4, 11, 14, 15, 16,
17, 19, 21, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32) ] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap( ~ key, scales = "free") + geom_histogram()
## boxplots
graph_data_continuous_1 <- graph_data_continuous %>% dplyr::select(-Outstanding.Balance,Outstanding.Balance)
new_col_names <- c("Age","Income","No.curr.resi","No.curr.comp",
"AvgCC.Util.12","trades_6","trades_12",
"PL_6","PL_12","inq_6_auto","In_12_auto",
"total_trade","Perf","Outstanding.balance")
colnames(graph_data_continuous_1) <- new_col_names
temp <- melt(graph_data_continuous_1[,1:13],id.vars = "Perf")
ggplot(data = temp, aes(x=variable, y=value)) + geom_boxplot(aes(fill=Perf))
boxplot(graph_data_continuous_1$Outstanding.balance)
graph_data_multivariate <- dplyr::filter(graph_data,graph_data$Performance=="Defaulters")
ggplot(data = graph_data_multivariate, aes(x = age_bin,y=Performance, fill = Income_bin)) +
geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") +
geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2),
label = paste0(round(prop.table(..count..) * 100,2), '%')),
stat = 'count',
position = position_dodge(.9),
size = 3) +
labs(x = 'Age group', y = 'Defaulters', fill = 'Income Group')
ggplot(data = graph_data_multivariate, aes(x = age_bin,y=Performance, fill = Gender)) +
geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") +
geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2),
label = paste0(round(prop.table(..count..) * 100,2), '%')),
stat = 'count',
position = position_dodge(.9),
size = 3) +
labs(x = 'Age group', y = 'Defaulters', fill = 'Gender')
ggplot(data = graph_data_multivariate, aes(x = Income_bin,y=Performance, fill = Gender)) +
geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") +
geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2),
label = paste0(round(prop.table(..count..) * 100,2), '%')),
stat = 'count',
position = position_dodge(.9),
size = 3) +
labs(x = 'Income group', y = 'Defaulters', fill = 'Gender')
ggplot(data = graph_data_multivariate, aes(x = No.of.dependents,y=Performance, fill = Income_bin)) +
geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") +
geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2),
label = paste0(round(prop.table(..count..) * 100,2), '%')),
stat = 'count',
position = position_dodge(.9),
size = 3) +
labs(x = 'Number of Dependents', y = 'Defaulters', fill = 'Income group')
ggplot(data = graph_data_multivariate, aes(x = Type.of.residence,y=Performance, fill = Income_bin)) +
geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") +
geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2),
label = paste0(round(prop.table(..count..) * 100,2), '%')),
stat = 'count',
position = position_dodge(.9),
size = 3) +
labs(x = 'Type of Residence', y = 'Defaulters', fill = 'Income group')
Top 7 Importat variables are highlighted
Variable Importance Conclusion
Age - High - Age group of 35-55 is significant
Gender - Low - Not significant feature
Marital Status - Medium - EDA also confirms Married Significant
No of dependents - Low - Not significant feature
Income - Low - Not significant feature
Education - Low - Not significant feature
Profession - High - Salaried is significant with high frequency
Type of residence - High - Rented is the most significant with high
No of months in current residence - High - < 12 months is high frequency
No of months in current company - Medium - EDA also confirms <24 Months has significant default
No of times 90 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 60 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 30 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 90 DPD or worse in last 12 months - Medium - Higher the number has default effect
No of times 60 DPD or worse in last 12 months - Medium - Higher the number has default effect
No of times 30 DPD or worse in last 12 months - Low - Not significant feature
Avgas CC Utilization in last 12 months - High - Most of the utilization are <20
No of trades opened in last 6 months - Low - Not significant feature
No of trades opened in last 12 months - Low - Not significant feature
No of PL trades opened in last 6 months - Low - Not significant feature
No of PL trades opened in last 12 months - Low - Not significant feature
No of Inquiries in last 6 months (excluding home & auto loans) - Low - Not significant feature
No of Inquiries in last 12 months (excluding home & auto loans) - Low - Not significant feature
Presence of open home loan - Low - Not significant feature
Outstanding Balance - Low - Not significant feature
Total No of Trades - Low - Not significant feature
Presence of open auto loan - Low - Not significant feature
# Feature Selection based on WoE/IV = 0.02 to 0.5
arrange(IV_Value [IV_Value$IV >=0.02, ], desc(IV))[1]
## Variable
## 1 Avgas.CC.Utilization.in.last.12.months
## 2 No.of.trades.opened.in.last.12.months
## 3 No.of.PL.trades.opened.in.last.12.months
## 4 No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 5 Outstanding.Balance
## 6 No.of.times.30.DPD.or.worse.in.last.6.months
## 7 Total.No.of.Trades
## 8 No.of.PL.trades.opened.in.last.6.months
## 9 No.of.times.90.DPD.or.worse.in.last.12.months
## 10 No.of.times.60.DPD.or.worse.in.last.6.months
## 11 No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 12 No.of.times.30.DPD.or.worse.in.last.12.months
## 13 No.of.trades.opened.in.last.6.months
## 14 No.of.times.60.DPD.or.worse.in.last.12.months
## 15 No.of.times.90.DPD.or.worse.in.last.6.months
## 16 No.of.months.in.current.residence
## 17 current_residence_bin
## 18 Income_imputed
## 19 Income
## 20 Income_bin
## 21 No.of.months.in.current.company
# Correlation matrix
features_for_correlationMatrix <-
c(
"Income_imputed",
"No.of.months.in.current.company",
"No.of.months.in.current.residence",
"Avgas.CC.Utilization.in.last.12.months_WoE",
"Avgas.CC.Utilization.in.last.12.months_imputed",
"Outstanding.Balance_WoE",
"Outstanding.Balance_imputed",
#"Presence.of.open.home.loan_WoE",
#"Presence.of.open.home.loan_imputed",
"Total.No.of.Trades",
"No.of.trades.opened.in.last.6.months",
"No.of.trades.opened.in.last.12.months",
"No.of.PL.trades.opened.in.last.6.months",
"No.of.PL.trades.opened.in.last.12.months",
"No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.",
"No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.",
"No.of.times.30.DPD.or.worse.in.last.6.months",
"No.of.times.60.DPD.or.worse.in.last.6.months",
"No.of.times.90.DPD.or.worse.in.last.6.months",
"No.of.times.30.DPD.or.worse.in.last.12.months",
"No.of.times.60.DPD.or.worse.in.last.12.months",
"No.of.times.90.DPD.or.worse.in.last.12.months"
)
plot_correlationMatrix (customer_master_data, features_for_correlationMatrix)
## Final Feature selection
# Following are list of features which are not having high correlation -0.5 to 0.5 with other featues
demographic_data_features <- c("Income_imputed",
"No.of.months.in.current.company",
"No.of.months.in.current.residence")
creditbureau_data_features <- c( "Avgas.CC.Utilization.in.last.12.months_WoE",
"Avgas.CC.Utilization.in.last.12.months_imputed",
"Outstanding.Balance_WoE",
"Outstanding.Balance_imputed",
"No.of.times.30.DPD.or.worse.in.last.6.months",
"No.of.trades.opened.in.last.12.months")
# Not using scale technique as there is no gain/loss with model performance/accuracy metrics
# customer_master_data[, scale_col] <- sapply(customer_master_data[, scale_col], scale)
View(customer_master_data)
set.seed(100)
# Randomly divide the data into training and test sets (stratified by class)
index <- createDataPartition(customer_master_data$Performance, p = 0.7, list = FALSE)
train_data <- customer_master_data[index, ]
summary(train_data$Performance)
## 0 1
## 46800 2063
2063/(46800+2063)*100
## [1] 4.222008
test_data <- customer_master_data[-index, ]
summary(test_data$Performance)
## 0 1
## 20056 883
883/(20056+883)*100
## [1] 4.217011
test_actual_default <- factor(ifelse(test_data$Performance ==1,"Yes","No"))
# Function for Choosing the optimal probalility cutoff
perform_fn <- function(cutoff, test_data_prediction)
{
predicted_default <- factor(ifelse(test_data_prediction >= cutoff, "Yes", "No"))
conf <- confusionMatrix(predicted_default, test_actual_default, positive = "Yes")
acc <- conf$overall[1]
sens <- conf$byClass[1]
spec <- conf$byClass[2]
out <- t(as.matrix(c(sens, spec, acc)))
colnames(out) <- c("sensitivity", "specificity", "accuracy")
return(out)
}
# Function for calculating Optimal Cutoff
findOptimalCutOff <- function (test_data_prediction, thresholdStart, thresholdEnd) {
# Summary of test probability
summary(test_data_prediction)
#s = seq(.03,.14,length=100)
s = seq(thresholdStart, thresholdEnd,length=100)
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i], test_data_prediction)
}
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0.75,0.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.02)]
cat("Optimal Cutoff = ", round(cutoff,3)[1])
cat ("\n\n")
return(round(cutoff,3)[1])
}
## Function for calculation of lift and cumulative gain
lift <- function(labels , predicted_prob, groups=10) {
if(is.factor(labels)) labels <- as.integer(as.character(labels ))
if(is.factor(predicted_prob)) predicted_prob <- as.integer(as.character(predicted_prob))
helper = data.frame(cbind(labels , predicted_prob))
helper[,"bucket"] = ntile(-helper[,"predicted_prob"], groups)
gaintable = helper %>% group_by(bucket) %>%
summarise_at(vars(labels ), funs(total = n(),
totalresp=sum(., na.rm = TRUE))) %>%
mutate(Cumresp = cumsum(totalresp),
Gain=Cumresp/sum(totalresp)*100,
Cumlift=Gain/(bucket*(100/groups)))
return(gaintable)
}
# For plotting the gain chart and to compute KS Statistic
GainLiftChart_KSStatistic <- function(model,data, value) {
temp <- data
temp$Predict <- predict(model,type=value,newdata=temp)
LG = lift(temp$Performance, temp$Predict, groups = 10)
# Gain Chart
plot(LG$bucket,LG$Gain,col="red",type="l",main="Gain Chart",xlab="% of total targeted",ylab = "% of positive Response")
# Lift Chart
plot(LG$bucket,LG$Cumlift,col="blue",type="l",main="Lift Chart",xlab="% of total targeted",ylab = "Lift")
write.csv(LG,"Lift-CumulativeGain-table.csv")
# KS-Statistic
if(value=="raw"){
pred_object_test<- prediction(as.numeric(temp$Predict), as.numeric(temp$Performance))
}else{
pred_object_test<- prediction(temp$Predict,temp$Performance)
}
performance_measures_test<- performance(pred_object_test, "tpr", "fpr")
ks_table_test <- attr(performance_measures_test, "y.values")[[1]] -
(attr(performance_measures_test, "x.values")[[1]])
#LG$KS <- ks_table_test
print(LG)
max(ks_table_test)
}
evaluateClassificationModel <- function (test_pred, test_actual_default, cutOff) {
# Get optimal cut-off
#cutOff <- findOptimalCutOff(test_pred, thresholdStart, thresholdEnd)
test_pred_default <- factor(ifelse(test_pred >= cutOff, "Yes", "No"))
table(test_actual_default,test_pred_default)
#install.packages("e1071")
library(e1071)
test_conf <- confusionMatrix(test_pred_default, test_actual_default, positive = "Yes")
acc <- test_conf$overall[1]
sens <- test_conf$byClass[1]
spec <- test_conf$byClass[2]
print(test_conf)
precission_recall_f <- accuracy.meas(test_actual_default, test_pred_default, cutOff)
# Using only F score
# Precision can be seen as a measure of exactness or quality, whereas recall is a measure of completeness or
# quantity. In simple terms, high precision means that an algorithm returned substantially more relevant
# results than irrelevant ones, while high recall means that an algorithm returned most of the relevant results.
roc_metrics <- roc.curve(test_actual_default, test_pred_default, plotit = T)
metrics <- data.frame(Accuracy=acc,
Sensitivity=sens,
Specificity = spec,
F_score=precission_recall_f$F,
Threshold=precission_recall_f$threshold,
AUC=roc_metrics$auc,
False_positive_Rate=roc_metrics$false.positive.rate[2],
True_positive_Rate=roc_metrics$true.positive.rate[2])
print(metrics)
return(metrics)
}
# Simple Logistic Regression model Using Demographic data
# Using WoE Variables as derived features in addition original features (imputed for mising / incorrect values)
# As WoE is created based on Target Encoding, in general is not directly correlated with base variable
# Using WoE Variable (Note: results are same with Income_imputed as well instead of Income_imputed_WoE)
# Also Income_imputed becomes significant when both are used and they both are highly correlated
logistic_model_demographic_data_unbalanced <- glm(formula = Performance ~ No.of.months.in.current.residence +
Income_imputed_WoE +
No.of.months.in.current.company,
family = "binomial",
data = train_data [, -1])
summary (logistic_model_demographic_data_unbalanced)
##
## Call:
## glm(formula = Performance ~ No.of.months.in.current.residence +
## Income_imputed_WoE + No.of.months.in.current.company, family = "binomial",
## data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4036 -0.3147 -0.2860 -0.2605 2.7860
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0071342 0.0496656 -60.548 < 2e-16
## No.of.months.in.current.residence 0.0018688 0.0005978 3.126 0.00177
## Income_imputed_WoE -1.0279515 0.1093138 -9.404 < 2e-16
## No.of.months.in.current.company -0.0054741 0.0011243 -4.869 1.12e-06
##
## (Intercept) ***
## No.of.months.in.current.residence **
## Income_imputed_WoE ***
## No.of.months.in.current.company ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16971 on 48859 degrees of freedom
## AIC: 16979
##
## Number of Fisher Scoring iterations: 6
# AIC: 16983
sort(vif(logistic_model_demographic_data_unbalanced),decreasing = TRUE)
## Income_imputed_WoE No.of.months.in.current.company
## 1.015650 1.015548
## No.of.months.in.current.residence
## 1.015372
# Income_imputed_WoE No.of.months.in.current.residence No.of.months.in.current.company
# 1.017824 1.016867 1.016003
test_pred = predict(logistic_model_demographic_data_unbalanced, type = "response",
newdata = test_data)
summary(test_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01471 0.03453 0.04085 0.04224 0.04879 0.07793
cutOff <- findOptimalCutOff(test_pred, .03,.14)
## Optimal Cutoff = 0.042
# Optimal Cutoff = 0.042
logistic_model_demographic_data_metrics <- evaluateClassificationModel(test_pred,
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 10813 394
## Yes 9243 489
##
## Accuracy : 0.5398
## 95% CI : (0.533, 0.5465)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0161
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.55379
## Specificity : 0.53914
## Pos Pred Value : 0.05025
## Neg Pred Value : 0.96484
## Prevalence : 0.04217
## Detection Rate : 0.02335
## Detection Prevalence : 0.46478
## Balanced Accuracy : 0.54647
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.5397583 0.5537939 0.5391404 0.04046375 0.042 0.5464671
## False_positive_Rate True_positive_Rate
## Accuracy 0.4608596 0.5537939
rownames(logistic_model_demographic_data_metrics) <- "DemographicData - GLM - Unbalanced"
model_Metrics <- rbind(logistic_model_demographic_data_metrics)
# test_pred_default <- factor(ifelse(test_pred >= 0.040, "Yes", "No"))
# Accuracy : 0.5373
# Sensitivity : 0.55606
# Specificity : 0.53650
#
# F : 0.040
#
# Area under the curve (AUC): 0.546
# Using WoE Variables as derived features in addition original features (imputed for mising / incorrect values)
# As WoE is created based on Target Encoding, in general is not directly correlated with base variable
logistic_model_application_and_creditdata <- glm(formula = Performance ~ Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial",
data = train_data [, -1])
summary (logistic_model_application_and_creditdata)
##
## Call:
## glm(formula = Performance ~ Income_imputed + No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months +
## No.of.trades.opened.in.last.12.months, family = "binomial",
## data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6628 -0.3459 -0.2568 -0.1929 2.9267
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.321e+00 9.715e-02
## Income_imputed -2.277e-03 1.563e-03
## No.of.months.in.current.company -2.086e-03 1.132e-03
## No.of.months.in.current.residence -1.231e-03 6.731e-04
## Avgas.CC.Utilization.in.last.12.months_WoE -5.179e-01 7.208e-02
## Avgas.CC.Utilization.in.last.12.months_imputed 2.838e-03 1.013e-03
## Outstanding.Balance_WoE -2.478e-01 7.023e-02
## Outstanding.Balance_imputed -1.089e-08 1.969e-08
## No.of.times.30.DPD.or.worse.in.last.6.months 1.692e-01 2.045e-02
## No.of.trades.opened.in.last.12.months 2.458e-02 5.929e-03
## z value Pr(>|z|)
## (Intercept) -34.189 < 2e-16 ***
## Income_imputed -1.457 0.145137
## No.of.months.in.current.company -1.844 0.065240 .
## No.of.months.in.current.residence -1.829 0.067357 .
## Avgas.CC.Utilization.in.last.12.months_WoE -7.185 6.74e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.802 0.005083 **
## Outstanding.Balance_WoE -3.529 0.000418 ***
## Outstanding.Balance_imputed -0.553 0.580165
## No.of.times.30.DPD.or.worse.in.last.6.months 8.273 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.146 3.38e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16354 on 48853 degrees of freedom
## AIC: 16374
##
## Number of Fisher Scoring iterations: 6
# AIC: 16374
logistic_model_application_and_creditdata_2 <- stepAIC(logistic_model_application_and_creditdata, direction="both")
## Start: AIC=16373.54
## Performance ~ Income_imputed + No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months +
## No.of.trades.opened.in.last.12.months
##
## Df Deviance AIC
## - Outstanding.Balance_imputed 1 16354 16372
## <none> 16354 16374
## - Income_imputed 1 16356 16374
## - No.of.months.in.current.residence 1 16357 16375
## - No.of.months.in.current.company 1 16357 16375
## - Avgas.CC.Utilization.in.last.12.months_imputed 1 16361 16379
## - Outstanding.Balance_WoE 1 16366 16384
## - No.of.trades.opened.in.last.12.months 1 16370 16388
## - Avgas.CC.Utilization.in.last.12.months_WoE 1 16406 16424
## - No.of.times.30.DPD.or.worse.in.last.6.months 1 16418 16436
##
## Step: AIC=16371.84
## Performance ~ Income_imputed + No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months
##
## Df Deviance AIC
## <none> 16354 16372
## - Income_imputed 1 16356 16372
## - No.of.months.in.current.company 1 16357 16373
## - No.of.months.in.current.residence 1 16357 16373
## + Outstanding.Balance_imputed 1 16354 16374
## - Avgas.CC.Utilization.in.last.12.months_imputed 1 16362 16378
## - Outstanding.Balance_WoE 1 16367 16383
## - No.of.trades.opened.in.last.12.months 1 16370 16386
## - Avgas.CC.Utilization.in.last.12.months_WoE 1 16406 16422
## - No.of.times.30.DPD.or.worse.in.last.6.months 1 16419 16435
summary(logistic_model_application_and_creditdata_2)
##
## Call:
## glm(formula = Performance ~ Income_imputed + No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months,
## family = "binomial", data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6613 -0.3459 -0.2565 -0.1930 2.9202
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.3289262 0.0961958
## Income_imputed -0.0022912 0.0015630
## No.of.months.in.current.company -0.0020907 0.0011318
## No.of.months.in.current.residence -0.0012437 0.0006727
## Avgas.CC.Utilization.in.last.12.months_WoE -0.5177126 0.0720487
## Avgas.CC.Utilization.in.last.12.months_imputed 0.0028376 0.0010127
## Outstanding.Balance_WoE -0.2522001 0.0697798
## No.of.times.30.DPD.or.worse.in.last.6.months 0.1694480 0.0204417
## No.of.trades.opened.in.last.12.months 0.0237502 0.0057367
## z value Pr(>|z|)
## (Intercept) -34.606 < 2e-16 ***
## Income_imputed -1.466 0.142681
## No.of.months.in.current.company -1.847 0.064712 .
## No.of.months.in.current.residence -1.849 0.064485 .
## Avgas.CC.Utilization.in.last.12.months_WoE -7.186 6.69e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.802 0.005078 **
## Outstanding.Balance_WoE -3.614 0.000301 ***
## No.of.times.30.DPD.or.worse.in.last.6.months 8.289 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.140 3.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16354 on 48854 degrees of freedom
## AIC: 16372
##
## Number of Fisher Scoring iterations: 6
# AIC: 16372
# Removing Income_imputed as p-value = 0.142681
logistic_model_application_and_creditdata_3 <- glm(formula = Performance ~
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial",
data = train_data[, -1])
summary(logistic_model_application_and_creditdata_3)
##
## Call:
## glm(formula = Performance ~ No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months +
## No.of.trades.opened.in.last.12.months, family = "binomial",
## data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6689 -0.3461 -0.2565 -0.1926 2.9160
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.393e+00 8.375e-02
## No.of.months.in.current.company -1.904e-03 1.124e-03
## No.of.months.in.current.residence -1.165e-03 6.712e-04
## Avgas.CC.Utilization.in.last.12.months_WoE -5.253e-01 7.193e-02
## Avgas.CC.Utilization.in.last.12.months_imputed 2.833e-03 1.013e-03
## Outstanding.Balance_WoE -2.516e-01 7.021e-02
## Outstanding.Balance_imputed -1.135e-08 1.969e-08
## No.of.times.30.DPD.or.worse.in.last.6.months 1.731e-01 2.026e-02
## No.of.trades.opened.in.last.12.months 2.493e-02 5.922e-03
## z value Pr(>|z|)
## (Intercept) -40.521 < 2e-16 ***
## No.of.months.in.current.company -1.694 0.090301 .
## No.of.months.in.current.residence -1.735 0.082706 .
## Avgas.CC.Utilization.in.last.12.months_WoE -7.303 2.81e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.798 0.005143 **
## Outstanding.Balance_WoE -3.583 0.000339 ***
## Outstanding.Balance_imputed -0.576 0.564429
## No.of.times.30.DPD.or.worse.in.last.6.months 8.544 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.210 2.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16356 on 48854 degrees of freedom
## AIC: 16374
##
## Number of Fisher Scoring iterations: 6
# AIC: 16374
sort(vif(logistic_model_application_and_creditdata_3),decreasing = TRUE)
## Avgas.CC.Utilization.in.last.12.months_WoE
## 2.736091
## Outstanding.Balance_WoE
## 1.987564
## Avgas.CC.Utilization.in.last.12.months_imputed
## 1.923009
## No.of.trades.opened.in.last.12.months
## 1.528385
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1.329917
## No.of.months.in.current.residence
## 1.111627
## Outstanding.Balance_imputed
## 1.076381
## No.of.months.in.current.company
## 1.031837
# Removing Outstanding.Balance_imputed due to high p-value = 0.564429
logistic_model_application_and_creditdata_4 <- glm(formula = Performance ~
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial",
data = train_data[, -1])
summary(logistic_model_application_and_creditdata_4)
##
## Call:
## glm(formula = Performance ~ No.of.months.in.current.company +
## No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months,
## family = "binomial", data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6673 -0.3461 -0.2562 -0.1928 2.9210
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.4018513 0.0825110
## No.of.months.in.current.company -0.0019075 0.0011243
## No.of.months.in.current.residence -0.0011771 0.0006707
## Avgas.CC.Utilization.in.last.12.months_WoE -0.5252183 0.0718986
## Avgas.CC.Utilization.in.last.12.months_imputed 0.0028331 0.0010125
## Outstanding.Balance_WoE -0.2561625 0.0697547
## No.of.times.30.DPD.or.worse.in.last.6.months 0.1734419 0.0202531
## No.of.trades.opened.in.last.12.months 0.0240682 0.0057306
## z value Pr(>|z|)
## (Intercept) -41.229 < 2e-16 ***
## No.of.months.in.current.company -1.697 0.08977 .
## No.of.months.in.current.residence -1.755 0.07927 .
## Avgas.CC.Utilization.in.last.12.months_WoE -7.305 2.77e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.798 0.00514 **
## Outstanding.Balance_WoE -3.672 0.00024 ***
## No.of.times.30.DPD.or.worse.in.last.6.months 8.564 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.200 2.67e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16356 on 48855 degrees of freedom
## AIC: 16372
##
## Number of Fisher Scoring iterations: 6
# AIC: 16372
sort(vif(logistic_model_application_and_creditdata_4),decreasing = TRUE)
## Avgas.CC.Utilization.in.last.12.months_WoE
## 2.733765
## Outstanding.Balance_WoE
## 1.959824
## Avgas.CC.Utilization.in.last.12.months_imputed
## 1.922838
## No.of.trades.opened.in.last.12.months
## 1.430790
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1.328788
## No.of.months.in.current.residence
## 1.110156
## No.of.months.in.current.company
## 1.031806
# Removing No.of.months.in.current.company as p-value = 0.08977
logistic_model_application_and_creditdata_5 <- glm(formula = Performance ~
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial",
data = train_data[, -1])
summary(logistic_model_application_and_creditdata_5)
##
## Call:
## glm(formula = Performance ~ No.of.months.in.current.residence +
## Avgas.CC.Utilization.in.last.12.months_WoE + Avgas.CC.Utilization.in.last.12.months_imputed +
## Outstanding.Balance_WoE + No.of.times.30.DPD.or.worse.in.last.6.months +
## No.of.trades.opened.in.last.12.months, family = "binomial",
## data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6663 -0.3463 -0.2562 -0.1927 2.9200
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.4690713 0.0725715
## No.of.months.in.current.residence -0.0011004 0.0006689
## Avgas.CC.Utilization.in.last.12.months_WoE -0.5280311 0.0718502
## Avgas.CC.Utilization.in.last.12.months_imputed 0.0028043 0.0010118
## Outstanding.Balance_WoE -0.2548958 0.0697123
## No.of.times.30.DPD.or.worse.in.last.6.months 0.1778615 0.0200813
## No.of.trades.opened.in.last.12.months 0.0238627 0.0057267
## z value Pr(>|z|)
## (Intercept) -47.802 < 2e-16 ***
## No.of.months.in.current.residence -1.645 0.099945 .
## Avgas.CC.Utilization.in.last.12.months_WoE -7.349 2.00e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.772 0.005579 **
## Outstanding.Balance_WoE -3.656 0.000256 ***
## No.of.times.30.DPD.or.worse.in.last.6.months 8.857 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.167 3.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16359 on 48856 degrees of freedom
## AIC: 16373
##
## Number of Fisher Scoring iterations: 6
# AIC: 16368
sort(vif(logistic_model_application_and_creditdata_5),decreasing = TRUE)
## Avgas.CC.Utilization.in.last.12.months_WoE
## 2.729035
## Outstanding.Balance_WoE
## 1.956879
## Avgas.CC.Utilization.in.last.12.months_imputed
## 1.921977
## No.of.trades.opened.in.last.12.months
## 1.429294
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1.307147
## No.of.months.in.current.residence
## 1.104710
# Removing No.of.months.in.current.residence due to high p-value = 0.099945
logistic_model_application_and_creditdata_6 <- glm(formula = Performance ~
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial", data = train_data[, -1])
summary(logistic_model_application_and_creditdata_6)
##
## Call:
## glm(formula = Performance ~ Avgas.CC.Utilization.in.last.12.months_WoE +
## Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE +
## No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months,
## family = "binomial", data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6573 -0.3465 -0.2578 -0.1915 2.8857
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.511071 0.068084
## Avgas.CC.Utilization.in.last.12.months_WoE -0.514043 0.071294
## Avgas.CC.Utilization.in.last.12.months_imputed 0.002667 0.001008
## Outstanding.Balance_WoE -0.250073 0.069589
## No.of.times.30.DPD.or.worse.in.last.6.months 0.177867 0.020088
## No.of.trades.opened.in.last.12.months 0.024971 0.005687
## z value Pr(>|z|)
## (Intercept) -51.570 < 2e-16 ***
## Avgas.CC.Utilization.in.last.12.months_WoE -7.210 5.59e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed 2.645 0.008180 **
## Outstanding.Balance_WoE -3.594 0.000326 ***
## No.of.times.30.DPD.or.worse.in.last.6.months 8.854 < 2e-16 ***
## No.of.trades.opened.in.last.12.months 4.391 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16362 on 48857 degrees of freedom
## AIC: 16374
##
## Number of Fisher Scoring iterations: 6
# AIC: 16374
sort(vif(logistic_model_application_and_creditdata_6),decreasing = TRUE)
## Avgas.CC.Utilization.in.last.12.months_WoE
## 2.690655
## Outstanding.Balance_WoE
## 1.954449
## Avgas.CC.Utilization.in.last.12.months_imputed
## 1.909329
## No.of.trades.opened.in.last.12.months
## 1.408925
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1.305800
# Removing Avgas.CC.Utilization.in.last.12.months_imputed
# Avgas.CC.Utilization.in.last.12.months_WoE - VIF=2.690655, p-value=5.59e-13
# Avgas.CC.Utilization.in.last.12.months_imputed - VIF=1.909329 ,p-value=0.008180
cor(train_data$Avgas.CC.Utilization.in.last.12.months_imputed, train_data$Avgas.CC.Utilization.in.last.12.months_WoE)
## [1] -0.7412454
# -0.7412454
logistic_model_application_and_creditdata_7 <- glm(formula = Performance ~
Avgas.CC.Utilization.in.last.12.months_WoE +
Outstanding.Balance_WoE +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
family = "binomial", data = train_data[, -1])
summary(logistic_model_application_and_creditdata_7)
##
## Call:
## glm(formula = Performance ~ Avgas.CC.Utilization.in.last.12.months_WoE +
## Outstanding.Balance_WoE + No.of.times.30.DPD.or.worse.in.last.6.months +
## No.of.trades.opened.in.last.12.months, family = "binomial",
## data = train_data[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6415 -0.3463 -0.2530 -0.1922 2.8780
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.382956 0.046997 -71.982
## Avgas.CC.Utilization.in.last.12.months_WoE -0.622885 0.057758 -10.784
## Outstanding.Balance_WoE -0.264025 0.069537 -3.797
## No.of.times.30.DPD.or.worse.in.last.6.months 0.172489 0.020023 8.615
## No.of.trades.opened.in.last.12.months 0.020781 0.005475 3.796
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## Avgas.CC.Utilization.in.last.12.months_WoE < 2e-16 ***
## Outstanding.Balance_WoE 0.000147 ***
## No.of.times.30.DPD.or.worse.in.last.6.months < 2e-16 ***
## No.of.trades.opened.in.last.12.months 0.000147 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17096 on 48862 degrees of freedom
## Residual deviance: 16368 on 48858 degrees of freedom
## AIC: 16378
##
## Number of Fisher Scoring iterations: 6
# AIC: 16378
sort(vif(logistic_model_application_and_creditdata_7),decreasing = TRUE)
## Outstanding.Balance_WoE
## 1.954735
## Avgas.CC.Utilization.in.last.12.months_WoE
## 1.776972
## No.of.trades.opened.in.last.12.months
## 1.297773
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1.292529
# Outstanding.Balance_WoE Avgas.CC.Utilization.in.last.12.months_WoE No.of.trades.opened.in.last.12.months No.of.times.30.DPD.or.worse.in.last.6.months
# 1.954735 1.776972 1.297773 1.292529
logistic_model_application_and_creditdata_unbalanced <- logistic_model_application_and_creditdata_7
test_pred = predict(logistic_model_application_and_creditdata_unbalanced, type = "response",
newdata = test_data)
summary(test_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01590 0.01915 0.03548 0.04245 0.06008 0.17121
cutOff <- findOptimalCutOff(test_pred, .01,.17)
## Optimal Cutoff = 0.049
# Optimal Cutoff = 0.049
logistic_model_application_and_creditdata_unbalanced_metrics <- evaluateClassificationModel(test_pred,
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 12717 338
## Yes 7339 545
##
## Accuracy : 0.6334
## 95% CI : (0.6268, 0.6399)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0525
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.61721
## Specificity : 0.63407
## Pos Pred Value : 0.06913
## Neg Pred Value : 0.97411
## Prevalence : 0.04217
## Detection Rate : 0.02603
## Detection Prevalence : 0.37652
## Balanced Accuracy : 0.62564
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.6333636 0.617214 0.6340746 0.04046375 0.049 0.6256443
## False_positive_Rate True_positive_Rate
## Accuracy 0.3659254 0.617214
rownames(logistic_model_application_and_creditdata_unbalanced_metrics) <- "FullData - GLM - Unbalanced"
model_Metrics <- rbind(model_Metrics, logistic_model_application_and_creditdata_unbalanced_metrics)
# Optimal Cutoff = 0.049
# Accuracy : 0.6334
# Sensitivity : 0.61721
# Specificity : 0.63407
#
# F : 0.04046375
#
# Area under the curve (AUC): 0.6256443
## Logistic Regression - Using Under Sampling
t8 <- Sys.time()
model_glm_fullCustomerData_undersampling <- caret::train(Performance ~
Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
data = train_data [, -1],
method = "glm",
family="binomial",
preProcess = c("scale", "center"),
tuneLength = 5,
trControl = trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
sampling = "down"))
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_undersampling <- predict(model_glm_fullCustomerData_undersampling,
type = "prob",
newdata = test_data)
t9 <- Sys.time()
t9-t8
## Time difference of 2.388349 secs
# Time difference of 2.016878 secs
summary(test_pred_fullCustomerData_glm_undersampling[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1994 0.2951 0.4598 0.4584 0.5971 0.8931
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.2164 0.2935 0.4670 0.4590 0.6035 0.8538
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_undersampling[,2], 0.20, 0.85)
## Optimal Cutoff = 0.541
# Optimal Cutoff = 0.541
model_glm_fullCustomerData_undersampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_undersampling[,2],
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 12735 333
## Yes 7321 550
##
## Accuracy : 0.6345
## 95% CI : (0.6279, 0.641)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0539
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.62288
## Specificity : 0.63497
## Pos Pred Value : 0.06988
## Neg Pred Value : 0.97452
## Prevalence : 0.04217
## Detection Rate : 0.02627
## Detection Prevalence : 0.37590
## Balanced Accuracy : 0.62892
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.634462 0.6228766 0.6349721 0.04046375 0.541 0.6289243
## False_positive_Rate True_positive_Rate
## Accuracy 0.3650279 0.6228766
rownames(model_glm_fullCustomerData_undersampling_metrics) <- "FullData - GLM - Under-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_undersampling_metrics)
# Accuracy : 0.6345
# Sensitivity : 0.62288
# Specificity : 0.06988
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6289243
# -------------------------------- Logistic Regression - Using Over Sampling
t8 <- Sys.time()
model_glm_fullCustomerData_oversampling <- caret::train(Performance ~
Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
data = train_data [, -1],
method = "glm",
family="binomial",
preProcess = c("scale", "center"),
tuneLength = 5,
trControl = trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
sampling = "up"))
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_oversampling <- predict(model_glm_fullCustomerData_oversampling,
type = "prob",
newdata = test_data)
t9 <- Sys.time()
t9-t8
## Time difference of 7.357556 secs
# Time difference of 7.105892 secs
summary(test_pred_fullCustomerData_glm_oversampling[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2206 0.2991 0.4604 0.4581 0.5917 0.8737
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.2224 0.3040 0.4597 0.4587 0.5897 0.8765
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_oversampling[,2], 0.2, 0.87)
## Optimal Cutoff = 0.532
# Optimal Cutoff = 0.532
model_glm_fullCustomerData_oversampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_oversampling[,2],
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 12468 319
## Yes 7588 564
##
## Accuracy : 0.6224
## 95% CI : (0.6158, 0.629)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0528
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.63873
## Specificity : 0.62166
## Pos Pred Value : 0.06919
## Neg Pred Value : 0.97505
## Prevalence : 0.04217
## Detection Rate : 0.02694
## Detection Prevalence : 0.38932
## Balanced Accuracy : 0.63020
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.6223793 0.6387316 0.6216594 0.04046375 0.532 0.6301955
## False_positive_Rate True_positive_Rate
## Accuracy 0.3783406 0.6387316
rownames(model_glm_fullCustomerData_oversampling_metrics) <- "FullData - GLM - Over-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_oversampling_metrics)
# Accuracy : 0.6264865
# Sensitivity : 0.6375991
# Specificity : 0.6259972
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6317982
#
# -------------------------------- Logistic Regression - Using SMOTE
t8 <- Sys.time()
model_glm_fullCustomerData_smote <- caret::train(Performance ~
Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
data = train_data [, -1],
method = "glm",
family="binomial",
preProcess = c("scale", "center"),
tuneLength = 5,
trControl = trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
sampling = "smote"))
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_smote <- predict(model_glm_fullCustomerData_smote,
type = "prob",
newdata = test_data)
t9 <- Sys.time()
t9-t8
## Time difference of 9.730844 secs
# Time difference of 10.60025 secs
summary(test_pred_fullCustomerData_glm_smote[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1595 0.2487 0.3860 0.3943 0.5191 0.8426
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1617 0.2449 0.3912 0.3948 0.5260 0.8312
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_smote[,2], 0.16, 0.83)
## Optimal Cutoff = 0.458
# Optimal Cutoff = 0.458
model_glm_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_smote[,2],
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 12568 319
## Yes 7488 564
##
## Accuracy : 0.6272
## 95% CI : (0.6206, 0.6337)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0544
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.63873
## Specificity : 0.62665
## Pos Pred Value : 0.07004
## Neg Pred Value : 0.97525
## Prevalence : 0.04217
## Detection Rate : 0.02694
## Detection Prevalence : 0.38455
## Balanced Accuracy : 0.63269
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.6271551 0.6387316 0.6266454 0.04046375 0.458 0.6326885
## False_positive_Rate True_positive_Rate
## Accuracy 0.3733546 0.6387316
rownames(model_glm_fullCustomerData_smotesampling_metrics) <- "FullData - GLM - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_smotesampling_metrics)
# Accuracy : 0.6270118
# Sensitivity : 0.6398641
# Specificity : 0.626446
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6326885
# -------------------------------- Decision Tree - Using Under-Sampling
# Accuracy Sensitivity Specificity F_score Threshold AUC False_positive_Rate True_positive_Rate
# Accuracy 0.05372749 0.9852775 0.0127144 0.04046375 0.05 0.5010041 0.9852775 0.9872856
# Discarding this Under-sampling options for Decision Tree
# -------------------------------- Decision Tree - Using Over-Sampling
# Accuracy Sensitivity Specificity F_score Threshold AUC False_positive_Rate True_positive_Rate
# Accuracy 0.8581594 0.1313703 0.8901576 0.04046375 0.05 0.5107639 0.1098424 0.1313703
# Discarding this over-sampling options for Decision Tree
# -------------------------------- Decision Tree - Using SMOTE
t8 <- Sys.time()
model_rpart_fullCustomerData_smote <- caret::train(Performance ~
Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
#Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
#Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
data = train_data [, -1],
method = "rpart",
#preProcess = c("scale", "center"),
#minsplit=30, minbucket = 15, cp=0.0001,
tuneLength = 5,
trControl = trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
sampling = "smote"))
## + Fold1: cp=9.695e-05
## - Fold1: cp=9.695e-05
## + Fold2: cp=9.695e-05
## - Fold2: cp=9.695e-05
## + Fold3: cp=9.695e-05
## - Fold3: cp=9.695e-05
## + Fold4: cp=9.695e-05
## - Fold4: cp=9.695e-05
## + Fold5: cp=9.695e-05
## - Fold5: cp=9.695e-05
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.000388 on full training set
# Fitting cp = 0.000388 on full training set
t9 <- Sys.time()
t9-t8
## Time difference of 10.41869 secs
# Time difference of 16.57429 secs
# # plot(model_rpart_fullCustomerData_smote)
# # prp(model_rpart_fullCustomerData_smote, box.palette = "Reds", tweak = 1.2)
# library(rpart.plot)
# rpart.plot(model_rpart_fullCustomerData_smote$finalModel)
test_pred_fullCustomerData_rpart_smote <- predict(model_rpart_fullCustomerData_smote,
type = "prob",
newdata = test_data)
summary(test_pred_fullCustomerData_rpart_smote[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1289 0.1782 0.2690 0.2766 1.0000
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.1083 0.1765 0.2533 0.2500 1.0000
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_rpart_smote[,2], 0.1, 0.25)
## Optimal Cutoff = 0.191
# Optimal Cutoff = 0.206
model_rpart_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_rpart_smote[,2],
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 11706 362
## Yes 8350 521
##
## Accuracy : 0.5839
## 95% CI : (0.5772, 0.5906)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0326
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.59003
## Specificity : 0.58367
## Pos Pred Value : 0.05873
## Neg Pred Value : 0.97000
## Prevalence : 0.04217
## Detection Rate : 0.02488
## Detection Prevalence : 0.42366
## Balanced Accuracy : 0.58685
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.5839343 0.590034 0.5836657 0.04046375 0.191 0.5868499
## False_positive_Rate True_positive_Rate
## Accuracy 0.4163343 0.590034
rownames(model_rpart_fullCustomerData_smotesampling_metrics) <- "FullData - RPART - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_rpart_fullCustomerData_smotesampling_metrics)
# Accuracy : 0.6015569
# Sensitivity : 0.599094
# Specificity : 0.6016653
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6003797
# -------------------------------- Random Forest - Using Under-Sampling
# Remove code snippet related to this model
# Accuracy Sensitivity Specificity F_score Threshold AUC False_positive_Rate True_positive_Rate
# Accuracy 0.6067147 0.6070215 0.6067012 0.04046375 0.522 0.6068614 0.3932988 0.6070215
# Discarding this Under-sampling options for RF
# -------------------------------- Random Forest - Using Over-Sampling
# Remove code snippet related to this model
# Accuracy Sensitivity Specificity F_score Threshold AUC False_positive_Rate True_positive_Rate
# Accuracy 0.5677444 0.5934315 0.5666135 0.04046375 0.05 0.5800225 0.4333865 0.5934315
# Also takes 50+ mins to build model. So, discarding RF Over-Sampling option
# Discarding this Over-sampling options for RF
# -------------------------------- Random Forest - Using SMOTE
t12 <- Sys.time()
model_rf_fullCustomerData_DemographicData_smote <- caret::train(Performance ~
Income_imputed +
No.of.months.in.current.company +
No.of.months.in.current.residence +
Avgas.CC.Utilization.in.last.12.months_WoE +
Avgas.CC.Utilization.in.last.12.months_imputed +
Outstanding.Balance_WoE +
Outstanding.Balance_imputed +
No.of.times.30.DPD.or.worse.in.last.6.months +
No.of.trades.opened.in.last.12.months,
data = train_data [, -1],
method = "rf",
ntree = 1000,
preProcess = c("scale", "center"),
trControl = trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
sampling = "smote"))
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=5
## - Fold1: mtry=5
## + Fold1: mtry=9
## - Fold1: mtry=9
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=5
## - Fold2: mtry=5
## + Fold2: mtry=9
## - Fold2: mtry=9
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=5
## - Fold3: mtry=5
## + Fold3: mtry=9
## - Fold3: mtry=9
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=5
## - Fold4: mtry=5
## + Fold4: mtry=9
## - Fold4: mtry=9
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=5
## - Fold5: mtry=5
## + Fold5: mtry=9
## - Fold5: mtry=9
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
t13 <- Sys.time()
t13-t12
## Time difference of 5.685651 mins
# Time difference of 4.712344 mins
test_pred_fullCustomerData_rf_smote <- predict(model_rf_fullCustomerData_DemographicData_smote,
type = "prob",
newdata = test_data)
summary(test_pred_fullCustomerData_rf_smote[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0990 0.2230 0.2459 0.3670 0.8390
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0010 0.0980 0.2240 0.2435 0.3650 0.8620
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_rf_smote[,2], .001,.86)
## Optimal Cutoff = 0.279
# Optimal Cutoff = 0.279
model_rf_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_rf_smote[,2],
test_actual_default,
cutOff)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 12180 352
## Yes 7876 531
##
## Accuracy : 0.607
## 95% CI : (0.6004, 0.6137)
## No Information Rate : 0.9578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0411
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.60136
## Specificity : 0.60730
## Pos Pred Value : 0.06316
## Neg Pred Value : 0.97191
## Prevalence : 0.04217
## Detection Rate : 0.02536
## Detection Prevalence : 0.40150
## Balanced Accuracy : 0.60433
##
## 'Positive' Class : Yes
##
## Accuracy Sensitivity Specificity F_score Threshold AUC
## Accuracy 0.607049 0.601359 0.6072996 0.04046375 0.279 0.6043293
## False_positive_Rate True_positive_Rate
## Accuracy 0.3927004 0.601359
rownames(model_rf_fullCustomerData_smotesampling_metrics) <- "FullData - RF - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_rf_fullCustomerData_smotesampling_metrics)
plot(varImp(object=model_rf_fullCustomerData_DemographicData_smote),main="Random Forest (SMOTE) - Variable Importance")
# Accuracy : 0.6212
# Sensitivity : 0.62288
# Specificity : 0.62111
#
# F: 0.040
#
# Area under the curve (AUC): 0.622
# Evaluate various metrics across vall models built
# Evaluating based on AUC, F-Score, Sensitivity, Specificity and Accuracy
View(model_Metrics)
# Analyse Lift, Gain and KS-Statistic metrics
model_Metrics$KSStatistic [1] <- GainLiftChart_KSStatistic(logistic_model_demographic_data_unbalanced, test_data, "response")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 2094 131 131 14.83579 1.483579
## 2 2 2094 103 234 26.50057 1.325028
## 3 3 2094 105 339 38.39185 1.279728
## 4 4 2094 88 427 48.35787 1.208947
## 5 5 2094 102 529 59.90940 1.198188
## 6 6 2094 70 599 67.83692 1.130615
## 7 7 2094 88 687 77.80294 1.111471
## 8 8 2094 69 756 85.61721 1.070215
## 9 9 2094 59 815 92.29898 1.025544
## 10 10 2093 68 883 100.00000 1.000000
model_Metrics$Lift [1] <- 1.1
model_Metrics$Gain [1] <- 59.91
model_Metrics$KSStatistic [2] <- GainLiftChart_KSStatistic(logistic_model_application_and_creditdata_unbalanced, test_data, "response")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 2094 181 181 20.49830 2.049830
## 2 2 2094 149 330 37.37259 1.868630
## 3 3 2094 135 465 52.66138 1.755379
## 4 4 2094 109 574 65.00566 1.625142
## 5 5 2094 88 662 74.97169 1.499434
## 6 6 2094 73 735 83.23896 1.387316
## 7 7 2094 48 783 88.67497 1.266785
## 8 8 2094 50 833 94.33749 1.179219
## 9 9 2094 32 865 97.96149 1.088461
## 10 10 2093 18 883 100.00000 1.000000
model_Metrics$Lift [2] <- 1.49
model_Metrics$Gain [2] <- 74.97
model_Metrics$KSStatistic [3] <- GainLiftChart_KSStatistic(model_glm_fullCustomerData_undersampling, test_data, "raw")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <int> <int> <dbl> <dbl>
## 1 1 2094 124 124 14.04304 1.404304
## 2 2 2094 129 253 28.65232 1.432616
## 3 3 2094 148 401 45.41336 1.513779
## 4 4 2094 138 539 61.04190 1.526048
## 5 5 2094 95 634 71.80068 1.436014
## 6 6 2094 47 681 77.12344 1.285391
## 7 7 2094 50 731 82.78596 1.182657
## 8 8 2094 48 779 88.22197 1.102775
## 9 9 2094 50 829 93.88448 1.043161
## 10 10 2093 54 883 100.00000 1.000000
model_Metrics$Lift [3] <- 1.43
model_Metrics$Gain [3] <- 71.80
model_Metrics$KSStatistic [4] <- GainLiftChart_KSStatistic(model_glm_fullCustomerData_oversampling, test_data, "raw")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <int> <int> <dbl> <dbl>
## 1 1 2094 125 125 14.15629 1.415629
## 2 2 2094 133 258 29.21857 1.460929
## 3 3 2094 148 406 45.97961 1.532654
## 4 4 2094 140 546 61.83465 1.545866
## 5 5 2094 90 636 72.02718 1.440544
## 6 6 2094 46 682 77.23669 1.287278
## 7 7 2094 48 730 82.67271 1.181039
## 8 8 2094 48 778 88.10872 1.101359
## 9 9 2094 49 827 93.65798 1.040644
## 10 10 2093 56 883 100.00000 1.000000
model_Metrics$Lift [4] <- 1.44
model_Metrics$Gain [4] <- 72.23
model_Metrics$KSStatistic [5] <-GainLiftChart_KSStatistic(model_glm_fullCustomerData_smote, test_data, "raw")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <int> <int> <dbl> <dbl>
## 1 1 2094 132 132 14.94904 1.494904
## 2 2 2094 163 295 33.40883 1.670442
## 3 3 2094 153 448 50.73613 1.691204
## 4 4 2094 56 504 57.07814 1.426954
## 5 5 2094 59 563 63.75991 1.275198
## 6 6 2094 54 617 69.87542 1.164590
## 7 7 2094 69 686 77.68969 1.109853
## 8 8 2094 63 749 84.82446 1.060306
## 9 9 2094 69 818 92.63873 1.029319
## 10 10 2093 65 883 100.00000 1.000000
model_Metrics$Lift [5] <- 1.27
model_Metrics$Gain [5] <- 63.75
model_Metrics$KSStatistic [6] <-GainLiftChart_KSStatistic(model_rpart_fullCustomerData_smote, test_data, "raw")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <int> <int> <dbl> <dbl>
## 1 1 2094 149 149 16.87429 1.687429
## 2 2 2094 108 257 29.10532 1.455266
## 3 3 2094 72 329 37.25934 1.241978
## 4 4 2094 78 407 46.09287 1.152322
## 5 5 2094 73 480 54.36014 1.087203
## 6 6 2094 88 568 64.32616 1.072103
## 7 7 2094 73 641 72.59343 1.037049
## 8 8 2094 78 719 81.42695 1.017837
## 9 9 2094 77 796 90.14723 1.001636
## 10 10 2093 87 883 100.00000 1.000000
model_Metrics$Lift [6] <- 1.08
model_Metrics$Gain [6] <- 54.36
model_Metrics$KSStatistic [7] <- GainLiftChart_KSStatistic(model_rf_fullCustomerData_DemographicData_smote, test_data, "raw")
## # A tibble: 10 x 6
## bucket total totalresp Cumresp Gain Cumlift
## <int> <int> <int> <int> <dbl> <dbl>
## 1 1 2094 140 140 15.85504 1.5855040
## 2 2 2094 66 206 23.32956 1.1664779
## 3 3 2094 86 292 33.06908 1.1023028
## 4 4 2094 71 363 41.10985 1.0277463
## 5 5 2094 93 456 51.64213 1.0328426
## 6 6 2094 83 539 61.04190 1.0173650
## 7 7 2094 87 626 70.89468 1.0127811
## 8 8 2094 83 709 80.29445 1.0036806
## 9 9 2094 83 792 89.69422 0.9966025
## 10 10 2093 91 883 100.00000 1.0000000
model_Metrics$Lift [7] <- 1.03
model_Metrics$Gain [7] <- 51.64
# KS-Statistic = 0.0643504
0.0643504
## [1] 0.0643504
View(model_Metrics)
print(model_Metrics)
## Accuracy Sensitivity Specificity
## DemographicData - GLM - Unbalanced 0.5397583 0.5537939 0.5391404
## FullData - GLM - Unbalanced 0.6333636 0.6172140 0.6340746
## FullData - GLM - Under-Sampling 0.6344620 0.6228766 0.6349721
## FullData - GLM - Over-Sampling 0.6223793 0.6387316 0.6216594
## FullData - GLM - SMOTE-Sampling 0.6271551 0.6387316 0.6266454
## FullData - RPART - SMOTE-Sampling 0.5839343 0.5900340 0.5836657
## FullData - RF - SMOTE-Sampling 0.6070490 0.6013590 0.6072996
## F_score Threshold AUC
## DemographicData - GLM - Unbalanced 0.04046375 0.042 0.5464671
## FullData - GLM - Unbalanced 0.04046375 0.049 0.6256443
## FullData - GLM - Under-Sampling 0.04046375 0.541 0.6289243
## FullData - GLM - Over-Sampling 0.04046375 0.532 0.6301955
## FullData - GLM - SMOTE-Sampling 0.04046375 0.458 0.6326885
## FullData - RPART - SMOTE-Sampling 0.04046375 0.191 0.5868499
## FullData - RF - SMOTE-Sampling 0.04046375 0.279 0.6043293
## False_positive_Rate
## DemographicData - GLM - Unbalanced 0.4608596
## FullData - GLM - Unbalanced 0.3659254
## FullData - GLM - Under-Sampling 0.3650279
## FullData - GLM - Over-Sampling 0.3783406
## FullData - GLM - SMOTE-Sampling 0.3733546
## FullData - RPART - SMOTE-Sampling 0.4163343
## FullData - RF - SMOTE-Sampling 0.3927004
## True_positive_Rate KSStatistic
## DemographicData - GLM - Unbalanced 0.5537939 0.1073425
## FullData - GLM - Unbalanced 0.6172140 0.2666422
## FullData - GLM - Under-Sampling 0.6228766 0.2532234
## FullData - GLM - Over-Sampling 0.6387316 0.2568917
## FullData - GLM - SMOTE-Sampling 0.6387316 0.2153007
## FullData - RPART - SMOTE-Sampling 0.5900340 0.1127819
## FullData - RF - SMOTE-Sampling 0.6013590 0.0643504
## Lift Gain
## DemographicData - GLM - Unbalanced 1.10 59.91
## FullData - GLM - Unbalanced 1.49 74.97
## FullData - GLM - Under-Sampling 1.43 71.80
## FullData - GLM - Over-Sampling 1.44 72.23
## FullData - GLM - SMOTE-Sampling 1.27 63.75
## FullData - RPART - SMOTE-Sampling 1.08 54.36
## FullData - RF - SMOTE-Sampling 1.03 51.64
# Top 2 Models Selected
# Note :
# -----
# 1) Discarding Random Forest as it involves high computational resources,
# and also not providing any better formance
# 2) Discarding GLM/Unbalanced with Full Data, as well because it is trained with unbalanced data
# FullData - GLM - SMOTE-Sampling
# FullData - GLM - Over-Sampling
#
# Discarding GLM/Unbalanced model though it has highest KS-Statistic value = 0.2666422 as it is based on Unabalanced data
# GLM/Over-Sampling model has better KS-Statistic value = 0.2568917 than GLM/SMOTE model KS-Statistic value = 0.2153007
#final_Model_For_Scorecard <- model_glm_fullCustomerData_oversampling
#final_Model_For_Scorecard$finalModel
final_Model_For_Scorecard <- logistic_model_application_and_creditdata_unbalanced
# Generalized Linear Model
#
# 48863 samples
# 9 predictor
# 2 classes: '0', '1'
#
# Pre-processing: scaled (9), centered (9)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 39090, 39091, 39091, 39090, 39090
# Addtional sampling using up-sampling prior to pre-processing
#
# Resampling results:
#
# Accuracy Kappa
# 0.5782698 0.04827151
# Coefficients:
# (Intercept) Income_imputed
# -0.00420 -0.01460
# No.of.months.in.current.company No.of.months.in.current.residence
# -0.03083 -0.06005
# Avgas.CC.Utilization.in.last.12.months_WoE Avgas.CC.Utilization.in.last.12.months_imputed
# -0.27858 0.11162
# Outstanding.Balance_WoE Outstanding.Balance_imputed
# -0.12984 -0.02812
# No.of.times.30.DPD.or.worse.in.last.6.months No.of.trades.opened.in.last.12.months
# 0.22748 0.14213
#
# Degrees of Freedom: 93599 Total (i.e. Null); 93590 Residual
# Null Deviance: 129800
# Residual Deviance: 120700 AIC: 120700
App_Scorecard <- function(model,testdataset){
m <- model
score_data <- testdataset
score_data$bad <- predict(m,type="response",newdata = score_data[,-12])
score_data$good <- (1- score_data$bad)
score_data$odds <- score_data$good/score_data$bad
score_data$logodds <- log(score_data$odds)
points0 = 400
odds0 = 10
pdo = 20
factor = pdo / log(2)
offset = points0 - factor * log( odds0 )
score_data$Score <- offset + factor * score_data$logodds
return(score_data)
}
testdata_scorecard <- App_Scorecard(final_Model_For_Scorecard,test_data)
rejecteddata_scorecard <- App_Scorecard(final_Model_For_Scorecard,rejected_records)
#Optimal Cutoff = 0.049 - for the unbalanced model
points0 = 400
odds0 = 10
pdo = 20
factor = pdo / log(2)
offset = points0 - factor * log( odds0 )
cutoff_prob_from_model = .049
cutoff_logodd <- log((1-cutoff_prob_from_model)/cutoff_prob_from_model)
cutoff_score <- offset + factor * cutoff_logodd
cutoff_score
## [1] 419.1333
#419.33
## rejected data analysis
nrow(rejecteddata_scorecard[(rejecteddata_scorecard$Score >= cutoff_score),])
## [1] 55
#55
boxplot(rejecteddata_scorecard$Score)
## With this build we would have got 55 good customers who had been rejected.
## Full data analysis
fulldata <- customer_master_data
fulldata_scorecard <- App_Scorecard(final_Model_For_Scorecard,fulldata)
ggplot(fulldata_scorecard,aes(fulldata_scorecard$Score,fill=fulldata_scorecard$Performance))+ geom_histogram(binwidth = 10,colour="black")
fulldata_scorecard$predict_performance <- ifelse(fulldata_scorecard$bad>=0.049,1,0)
fulldata_scorecard$iswrong <- ifelse(fulldata_scorecard$predict_performance != fulldata_scorecard$Performance,1,0)
percent_of_wrongprediction <- (sum(fulldata_scorecard$iswrong)/nrow(fulldata_scorecard)) * 100
percent_of_wrongprediction
## [1] 36.40154
##------------------- expected credit loss-------------------------S
#Expected loss(c1) = PD * EAD * LGD
#PD = Probability of default of each customer
#EAD = Exposure at default or oustanding
#LGD = Loss given default.
#Lets assume if recovery likelihood is 30% then LDG = 1 - 0.30 = 0.7
#Total loss expected if all customers are bad
fulldata_scorecard$expected_loss = fulldata_scorecard$bad * fulldata_scorecard$Outstanding.Balance_imputed * 0.7
# Calculated on Full data
# Total prospect loss = 2634047450
# (Prob of bad * Exposure at default * Loss given default)
# Expected loss by default customer from model 147718048
#
# The loss amount of 147718048 can be straight away avoided by not giving loan to default customer prospects
# However, by looking into the application score card, some customers of default category can be consider at medium risk because they fall in the boundary range.
# This potential credit loss can be minimized by target those customer, which Credit Score falls within Good and Intermediate.
# The verification / acquisition cost of Bad Customer can be minimized by this Model
#Creating a dataframe for loss calculation
potential_credit_loss <- fulldata_scorecard[, c("Performance", "bad","Outstanding.Balance_imputed")]
#Subsetting for the defaulted customers
loss_default_customer <- potential_credit_loss[(potential_credit_loss$Performance == 1),]
#Loss if the model is being used on the defaulted customer
loss_default_customer$loss_model <- as.integer(loss_default_customer$bad * loss_default_customer$Outstanding.Balance_imputed * 0.7)
#Calculating the total expected loss and the loss with the model.
total_expected_loss = sum(fulldata_scorecard$expected_loss)
total_extected_loss_default_cust <- sum(loss_default_customer$loss_model)
print(total_expected_loss)
## [1] 2634047450
print(total_extected_loss_default_cust)
## [1] 147718048
#auto rejection rate
auto_rejection_rate <- sum(fulldata_scorecard$predict_performance)/nrow(fulldata_scorecard)
auto_approval_rate = 1 - auto_rejection_rate
auto_approval_rate
## [1] 0.6264434
# Auto approval rate is 62.64%
# Rejected data analysis
#Number of good customers that is being rejected
nrow(rejecteddata_scorecard[(rejecteddata_scorecard$Score >= cutoff_score),])
## [1] 55
#55
boxplot(rejecteddata_scorecard$Score)
# The histograms plots indicates that the number of defaulters decreases after Cut-off Score of 419
# Even though 419 is boundary value with Good and Bad Customers, we can suggest that the boundary range of customers fall between Good and Bad.
rejecteddata_scorecard$expected_loss = rejecteddata_scorecard$bad * rejecteddata_scorecard$Outstanding.Balance_imputed * 0.7
rejecteddata_loss <- rejecteddata_scorecard[,c("Score","bad","Outstanding.Balance_imputed")]
rejecteddata_cutoff_score <- rejecteddata_loss[(rejecteddata_loss$Score >= 419),]
loss_by_rejected_good_customer <- sum(rejecteddata_cutoff_score$Outstanding.Balance_imputed * 0.7)
# Total prospect loss = 96026810
# (Loss because of the full rejected data)
#
# Loss due of Rejection of Good customers is 43876837
#
# The amount of 43876837 would have been gained on using the model because it was the loss by rejection the good customers
rejecteddata_expected_loss <- sum(rejecteddata_scorecard$expected_loss)
print(rejecteddata_expected_loss)
## [1] 96026810
# 96026810
print(loss_by_rejected_good_customer)
## [1] 43876837
# 43876837